code.delx.au - gnu-emacs/blob - src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  24 Boston, MA 02110-1301, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from a coding system symbol to its attributes vector is done by
  62   looking up the symbol in Vcoding_system_hash_table.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
     /* Template of a detector: scan the source bytes of CODING.  If the
        source is exhausted without meeting a nonconforming byte, merge
        FOUND into DETECT_INFO->found and return 1; otherwise add
        CATEGORY_MASK_XXX to DETECT_INFO->rejected and return 0.  */
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   const unsigned char *src = coding->source;
 161   const unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the source is exhausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
           /* Remember the category only when C is positive evidence for
              XXX; a byte that merely conforms leaves FOUND unchanged.  */
 175       if (__C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source was exhausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size;
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
     /* Template of a decoder: read bytes of CODING's source starting at
        CODING->consumed and store decoded characters in CODING->charbuf
        starting at CODING->charbuf_used, then update CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used.  */
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   const unsigned char *src = coding->source + coding->consumed;
 209   const unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   const unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf + coding->charbuf_used;
 216   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf >= charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.
        NOTE(review): this template nevertheless stores the same byte
        offset in both members -- confirm what real decoders do.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that the source area and the destination area
 257   overlap, which means that we can produce encoded text only until it
 258   reaches the head of the not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
     /* Template of an encoder: encode the CODING->charbuf_used characters
        held in CODING->charbuf into CODING->destination starting at
        CODING->produced, then update CODING->produced_char and
        CODING->produced.  */
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
     /* CHARBUF is a plain `int *', so the end is computed from it
        directly; it has no `charbuf' member to dereference.  */
 268   int *charbuf_end = charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
     /* Stop early enough that one more loop iteration cannot write past
        DST_END.  */
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
     /* NOTE(review): presumably the hash table that maps a coding-system
        symbol to its attribute vector (see the general comments at the top
        of this file) -- confirm against the code that fills it.  */
 301 Lisp_Object Vcoding_system_hash_table;
 302
     /* Lisp objects whose names follow the Q... / QC... pattern.  Their
        values are assigned outside this part of the file.  Note that
        QCdefalut_char is spelled this way in its definition; a rename
        would have to cover every use of the identifier.  */
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
 314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317 Lisp_Object QCascii_compatible_p;
 318
     /* The first two are only declared here (extern); the rest are
        defined in this file.  */
 319 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 320 Lisp_Object Qcall_process, Qcall_process_region;
 321 Lisp_Object Qstart_process, Qopen_network_stream;
 322 Lisp_Object Qtarget_idx;
 323
     /* NOTE(review): the names parallel the CODING_RESULT_XXX codes
        mentioned in the general notes above -- confirm the mapping.  */
 324 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 325 Lisp_Object Qinterrupted, Qinsufficient_memory;
 326
 327 int coding_system_require_warning;
 328
 329 Lisp_Object Vselect_safe_coding_system_function;
 330
 331 /* Mnemonic string for each end-of-line format.  */
 332 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 333 /* Mnemonic string indicating that the end-of-line format is not yet
 334    decided.  */
 335 Lisp_Object eol_mnemonic_undecided;
 336
 337 #ifdef emacs
 338
     /* Everything down to the matching #endif is defined only when the
        macro `emacs' is defined.  */
 339 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 340
 341 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 342
 343 /* Coding systems emacs-mule and raw-text are for converting only
 344    the end-of-line format.  */
 345 Lisp_Object Qemacs_mule, Qraw_text;
 346 Lisp_Object Qutf_8_emacs;
 347
 348 /* Coding systems are passed between Emacs Lisp programs and C internal
 349    routines by the following three variables.  */
 350 /* Coding system for reading files and receiving data from a process.  */
 351 Lisp_Object Vcoding_system_for_read;
 352 /* Coding system for writing files and sending data to a process.  */
 353 Lisp_Object Vcoding_system_for_write;
 354 /* Coding system actually used in the latest I/O.  */
 355 Lisp_Object Vlast_coding_system_used;
 356 /* Set to non-nil when an error is detected during code conversion.  */
 357 Lisp_Object Vlast_code_conversion_error;
 358 /* A vector of length 256 which contains information about special
 359    Latin codes (especially for dealing with Microsoft codes).  */
 360 Lisp_Object Vlatin_extra_code_table;
 361
 362 /* Flag to inhibit code conversion of end-of-line format.  */
 363 int inhibit_eol_conversion;
 364
 365 /* Flag to inhibit ISO2022 escape sequence detection.  */
 366 int inhibit_iso_escape_detection;
 367
 368 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 369 int inherit_process_coding_system;
 370
 371 /* Coding system to be used to encode text for terminal display.  */
 372 struct coding_system terminal_coding;
 373
 374 /* Coding system to be used to encode text for terminal display when
 375    terminal coding system is nil.  */
 376 struct coding_system safe_terminal_coding;
 377
 378 /* Coding system of what is sent from terminal keyboard.  */
 379 struct coding_system keyboard_coding;
 380
     /* NOTE(review): judging by the names, alists used to choose coding
        systems for file, process and network I/O -- confirm.  */
 381 Lisp_Object Vfile_coding_system_alist;
 382 Lisp_Object Vprocess_coding_system_alist;
 383 Lisp_Object Vnetwork_coding_system_alist;
 384
 385 Lisp_Object Vlocale_coding_system;
 386
 387 #endif /* emacs */
 388
 389 /* Flag telling whether translation tables are looked up during
 390    character code conversion.  */
 391 Lisp_Object Venable_character_translation;
 392 /* Standard translation table to look up on decoding (reading).  */
 393 Lisp_Object Vstandard_translation_table_for_decode;
 394 /* Standard translation table to look up on encoding (writing).  */
 395 Lisp_Object Vstandard_translation_table_for_encode;
 396
     /* Lisp objects related to translation tables; their values are
        assigned outside this part of the file.  */
 397 Lisp_Object Qtranslation_table;
 398 Lisp_Object Qtranslation_table_id;
 399 Lisp_Object Qtranslation_table_for_decode;
 400 Lisp_Object Qtranslation_table_for_encode;
 401
 402 /* Alist of charsets vs revision number.  Private to this file.  */
 403 static Lisp_Object Vcharset_revision_table;
 404
 405 /* Default coding systems used for process I/O.  */
 406 Lisp_Object Vdefault_process_coding_system;
 407
 408 /* Char table for translating Quail and self-inserting input.  */
 409 Lisp_Object Vtranslation_table_for_input;
 410
 411 /* Two special coding systems.  */
 412 Lisp_Object Vsjis_coding_system;
 413 Lisp_Object Vbig5_coding_system;
 414
 415 /* ISO2022 section */
 416
     /* Integer stored at index REG of the `coding_attr_iso_initial'
        attribute of the coding system identified by CODING->id.  */
 417 #define CODING_ISO_INITIAL(coding, reg)                 \
 418   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 419                      coding_attr_iso_initial),          \
 420                reg)))
 421
 422
     /* Entry of CODING->safe_charsets for CHARSET_ID, or -1 when
        CHARSET_ID is greater than CODING->max_charset_id.  CHARSET_ID is
        parenthesized in the comparison so that an argument containing an
        operator of lower precedence than `<=' is compared as a whole.
        CHARSET_ID is evaluated twice when it is in range.  */
 423 #define CODING_ISO_REQUEST(coding, charset_id)  \
 424   (((charset_id) <= (coding)->max_charset_id    \
 425     ? (coding)->safe_charsets[charset_id]       \
 426     : -1))
 427
 428
     /* Accessors for the members of CODING->spec.iso_2022.  */
 429 #define CODING_ISO_FLAGS(coding)        \
 430   ((coding)->spec.iso_2022.flags)
 431 #define CODING_ISO_DESIGNATION(coding, reg)     \
 432   ((coding)->spec.iso_2022.current_designation[reg])
 433 #define CODING_ISO_INVOCATION(coding, plane)    \
 434   ((coding)->spec.iso_2022.current_invocation[plane])
 435 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 436   ((coding)->spec.iso_2022.single_shifting)
 437 #define CODING_ISO_BOL(coding)  \
 438   ((coding)->spec.iso_2022.bol)
     /* Designation of the register currently invoked to PLANE.  */
 439 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 440   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 441
 442 /* Control characters of ISO2022.  */
 443                         /* code */      /* function */
 444 #define ISO_CODE_LF     0x0A            /* line-feed */
 445 #define ISO_CODE_CR     0x0D            /* carriage-return */
 446 #define ISO_CODE_SO     0x0E            /* shift-out */
 447 #define ISO_CODE_SI     0x0F            /* shift-in */
 448 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 449 #define ISO_CODE_ESC    0x1B            /* escape */
 450 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 451 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 452 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 453
 454 /* Every 1-byte code of ISO2022 is classified into one of the
 455    following classes.  */
 456 enum iso_code_class_type
 457   {
 458     ISO_control_0,              /* Control codes in the range
 459                                    0x00..0x1F and 0x7F, except for the
 460                                    following 4 codes.  */
 461     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 462     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 463     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 464     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 465     ISO_control_1,              /* Control codes in the range
 466                                    0x80..0x9F, except for the
 467                                    following 3 codes.  */
 468     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 469     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 470     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 471     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 472     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 473     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 474     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 475   };
 476
 477 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 478     `iso-flags' attribute of an iso2022 coding system.  */
 479
 480 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 481    instead of the correct short-form sequence (e.g. ESC $ A).  */
 482 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 483
 484 /* If set, reset graphic planes and registers at end-of-line to the
 485    initial state.  */
 486 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 487
 488 /* If set, reset graphic planes and registers before any control
 489    characters to the initial state.  */
 490 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 491
 492 /* If set, encode by 7-bit environment.  */
 493 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 494
 495 /* If set, use locking-shift function.  */
 496 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 497
 498 /* If set, use single-shift function.  Overwrite
 499    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 500 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 501
 502 /* If set, use designation escape sequence.  */
 503 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 504
 505 /* If set, produce revision number sequence.  */
 506 #define CODING_ISO_FLAG_REVISION        0x0080
 507
 508 /* If set, produce ISO6429's direction specifying sequence.  */
 509 #define CODING_ISO_FLAG_DIRECTION       0x0100
 510
 511 /* If set, assume designation states are reset at beginning of line on
 512    output.  */
 513 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 514
 515 /* If set, designation sequence should be placed at beginning of line
 516    on output.  */
 517 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 518
 519 /* If set, do not encode unsafe characters on output.  */
 520 #define CODING_ISO_FLAG_SAFE            0x0800
 521
 522 /* If set, extra latin codes (128..159) are accepted as a valid code
 523    on input.  */
 524 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 525
     /* NOTE(review): the five flags below carry no description in this
        file; their meaning should be confirmed where they are tested.
        The last one is 0x100000, so bits 0x20000..0x80000 are unused
        here.  */
 526 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 527
 528 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 529
 530 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 531
 532 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 533
 534 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 535
 536 /* A character to be produced on output if encoding of the original
 537    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 538 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 539
 540
 541 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 542 #define CODING_UTF_16_BOM(coding)       \
 543   ((coding)->spec.utf_16.bom)
 544
 545 #define CODING_UTF_16_ENDIAN(coding)    \
 546   ((coding)->spec.utf_16.endian)
 547
 548 #define CODING_UTF_16_SURROGATE(coding) \
 549   ((coding)->spec.utf_16.surrogate)
 550
 551
 552 /* CCL section: each macro fetches one attribute (coding_attr_ccl_XXX)
        of the coding system identified by CODING->id; CODING_CCL_VALIDS
        additionally applies SDATA to that attribute.  */
 553 #define CODING_CCL_DECODER(coding)      \
 554   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 555 #define CODING_CCL_ENCODER(coding)      \
 556   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 557 #define CODING_CCL_VALIDS(coding)                                          \
 558   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 559
 560 /* Index for each coding category in `coding_categories'.  The
        CATEGORY_MASK_XXX macros below are built as 1 shifted left by
        these values, and coding_category_max is used as the length of
        the `coding_priorities' and `coding_categories' arrays.  */
 561
 562 enum coding_category
 563   {
 564     coding_category_iso_7,
 565     coding_category_iso_7_tight,
 566     coding_category_iso_8_1,
 567     coding_category_iso_8_2,
 568     coding_category_iso_7_else,
 569     coding_category_iso_8_else,
 570     coding_category_utf_8,
 571     coding_category_utf_16_auto,
 572     coding_category_utf_16_be,
 573     coding_category_utf_16_le,
 574     coding_category_utf_16_be_nosig,
 575     coding_category_utf_16_le_nosig,
 576     coding_category_charset,
 577     coding_category_sjis,
 578     coding_category_big5,
 579     coding_category_ccl,
 580     coding_category_emacs_mule,
 581     /* All above are targets of code detection.  */
 582     coding_category_raw_text,
 583     coding_category_undecided,
 584     coding_category_max
 585   };
 586
 587 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        1 shifted left by the corresponding `enum coding_category' value.  */
 588 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 589 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 590 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 591 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 592 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 593 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 594 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 595 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 596 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 597 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 598 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 599 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 600 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 601 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 602 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 603 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 604 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 605 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 606
 607 /* This value is returned if detect_coding_mask () finds nothing other
 608    than ASCII characters.
        NOTE(review): CATEGORY_MASK_UTF_16_AUTO and CATEGORY_MASK_RAW_TEXT
        are not part of this union -- confirm that this is intended.  */
 609 #define CATEGORY_MASK_ANY               \
 610   (CATEGORY_MASK_ISO_7                  \
 611    | CATEGORY_MASK_ISO_7_TIGHT          \
 612    | CATEGORY_MASK_ISO_8_1              \
 613    | CATEGORY_MASK_ISO_8_2              \
 614    | CATEGORY_MASK_ISO_7_ELSE           \
 615    | CATEGORY_MASK_ISO_8_ELSE           \
 616    | CATEGORY_MASK_UTF_8                \
 617    | CATEGORY_MASK_UTF_16_BE            \
 618    | CATEGORY_MASK_UTF_16_LE            \
 619    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 620    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 621    | CATEGORY_MASK_CHARSET              \
 622    | CATEGORY_MASK_SJIS                 \
 623    | CATEGORY_MASK_BIG5                 \
 624    | CATEGORY_MASK_CCL                  \
 625    | CATEGORY_MASK_EMACS_MULE)
 626
 627
     /* The two 7-bit ISO categories.  */
 628 #define CATEGORY_MASK_ISO_7BIT \
 629   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 630
     /* The two 8-bit ISO categories.  */
 631 #define CATEGORY_MASK_ISO_8BIT \
 632   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 633
     /* The two `else' ISO categories.  */
 634 #define CATEGORY_MASK_ISO_ELSE \
 635   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 636
     /* NOTE(review): by its name, the ISO categories that rely on escape
        sequences; it is every ISO category except ISO_8_1 and ISO_8_2.  */
 637 #define CATEGORY_MASK_ISO_ESCAPE        \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_7_ELSE           \
 641    | CATEGORY_MASK_ISO_8_ELSE)
 642
     /* All six ISO categories.  */
 643 #define CATEGORY_MASK_ISO       \
 644   (  CATEGORY_MASK_ISO_7BIT     \
 645      | CATEGORY_MASK_ISO_8BIT   \
 646      | CATEGORY_MASK_ISO_ELSE)
 647
     /* The four UTF-16 categories with an explicit byte order.
        NOTE(review): CATEGORY_MASK_UTF_16_AUTO is not included --
        confirm that this is intended.  */
 648 #define CATEGORY_MASK_UTF_16            \
 649   (CATEGORY_MASK_UTF_16_BE              \
 650    | CATEGORY_MASK_UTF_16_LE            \
 651    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 652    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 653
 654
 655 /* List of symbols `coding-category-xxx' ordered by priority.  This
 656    variable is exposed to Emacs Lisp.  */
 657 static Lisp_Object Vcoding_category_list;
 658
 659 /* Table of coding categories (Lisp symbols).  This variable is for
 660    internal use only.  */
 661 static Lisp_Object Vcoding_category_table;
 662
 663 /* Table of coding categories ordered by priority; it has
        coding_category_max elements.  */
 664 static enum coding_category coding_priorities[coding_category_max];
 665
 666 /* Nth element is a coding context for the coding system bound to the
 667    Nth coding category.  */
 668 static struct coding_system coding_categories[coding_category_max];
 669
 670 /*** Commonly used macros and functions ***/
 671
     /* Smaller / larger of A and B, defined only if not already provided.
        Each macro evaluates one of its arguments twice, so arguments
        with side effects must be avoided.  */
 672 #ifndef min
 673 #define min(a, b) ((a) < (b) ? (a) : (b))
 674 #endif
 675 #ifndef max
 676 #define max(a, b) ((a) > (b) ? (a) : (b))
 677 #endif
 678
     /* Set ATTRS to the attributes of the coding system identified by
        CODING->id, and CHARSET_LIST to the charset list taken from those
        attributes.  ATTRS and CHARSET_LIST must be assignable.  */
 679 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 680   do {                                                  \
 681     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 682     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 683   } while (0)
 684
 685
 686 /* Safely get one byte from the source text pointed to by SRC, which
 687    ends at SRC_END, and set C to that byte.  If there are not enough
 688    bytes in the source, jump to `no_more_source'.  If multibytep is
 689    nonzero, and a multibyte character is found at SRC, set C to the
 690    negative value of the character code.  The caller should declare
 691    and set these variables appropriately in advance:
 692         src, src_end, src_base, multibytep, consumed_chars, coding */
 693
     /* Details visible in the expansion:
        - When SRC equals SRC_END and SRC_BASE is before SRC,
          CODING_RESULT_INSUFFICIENT_SRC is recorded in CODING before the
          jump to no_more_source.
        - When multibytep is nonzero and the byte is 0xC0 or 0xC1, C
          becomes ((C & 1) << 6) | the next byte; that next byte is
          fetched without comparing SRC against SRC_END.
        - For any other byte with the high bit set (multibytep nonzero), C
          becomes the negative of the value returned by string_char, SRC
          is moved past that character, and CODING_RESULT_INVALID_SRC is
          recorded in CODING.
        - CONSUMED_CHARS is incremented once for each fetched unit.  */
 694 #define ONE_MORE_BYTE(c)                                \
 695   do {                                                  \
 696     if (src == src_end)                                 \
 697       {                                                 \
 698         if (src_base < src)                             \
 699           record_conversion_result                      \
 700             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 701         goto no_more_source;                            \
 702       }                                                 \
 703     c = *src++;                                         \
 704     if (multibytep && (c & 0x80))                       \
 705       {                                                 \
 706         if ((c & 0xFE) == 0xC0)                         \
 707           c = ((c & 1) << 6) | *src++;                  \
 708         else                                            \
 709           {                                             \
 710             src--;                                      \
 711             c = - string_char (src, &src, NULL);        \
 712             record_conversion_result                    \
 713               (coding, CODING_RESULT_INVALID_SRC);      \
 714           }                                             \
 715       }                                                 \
 716     consumed_chars++;                                   \
 717   } while (0)
 718
 719
     /* Like ONE_MORE_BYTE, but without the test of SRC against SRC_END
        (and therefore without the jump to no_more_source); the caller
        must already know that a byte is available.  Uses src,
        multibytep, consumed_chars and coding.  */
 720 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 721   do {                                                  \
 722     c = *src++;                                         \
 723     if (multibytep && (c & 0x80))                       \
 724       {                                                 \
 725         if ((c & 0xFE) == 0xC0)                         \
 726           c = ((c & 1) << 6) | *src++;                  \
 727         else                                            \
 728           {                                             \
 729             src--;                                      \
 730             c = - string_char (src, &src, NULL);        \
 731             record_conversion_result                    \
 732               (coding, CODING_RESULT_INVALID_SRC);      \
 733           }                                             \
 734       }                                                 \
 735     consumed_chars++;                                   \
 736   } while (0)
 737
 738
 739 /* Store a byte C in the place pointed to by DST, advance DST to the
 740    next free position, and increment PRODUCED_CHARS.  The caller should
 741    assure that C is 0..127, and declare and set the variables `dst'
 742    and `produced_chars' appropriately in advance.
 743 */
 744
 745
 746 #define EMIT_ONE_ASCII_BYTE(c)  \
 747   do {                          \
 748     produced_chars++;           \
 749     *dst++ = (c);               \
 750   } while (0)
 751
 752
 753 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.
        PRODUCED_CHARS is increased by 2.  */
 754
 755 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 756   do {                                  \
 757     produced_chars += 2;                \
 758     *dst++ = (c1), *dst++ = (c2);       \
 759   } while (0)
 760
 761
 762 /* Store a byte C in the place pointed to by DST, advance DST to the
 763    next free position, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 764    nonzero, store it in an appropriate multibyte form: a value of 0x80
 765    or more is first converted with BYTE8_TO_CHAR, and the result is
 766    written with CHAR_STRING_ADVANCE.  The caller should declare and set
        the variables `dst', `multibytep' and `produced_chars'
        appropriately in advance.  */
 767
 768 #define EMIT_ONE_BYTE(c)                \
 769   do {                                  \
 770     produced_chars++;                   \
 771     if (multibytep)                     \
 772       {                                 \
 773         int ch = (c);                   \
 774         if (ch >= 0x80)                 \
 775           ch = BYTE8_TO_CHAR (ch);      \
 776         CHAR_STRING_ADVANCE (ch, dst);  \
 777       }                                 \
 778     else                                \
 779       *dst++ = (c);                     \
 780   } while (0)
 781
 782
 783 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  PRODUCED_CHARS
        is increased by 2.  */
 784
 785 #define EMIT_TWO_BYTES(c1, c2)          \
 786   do {                                  \
 787     produced_chars += 2;                \
 788     if (multibytep)                     \
 789       {                                 \
 790         int ch;                         \
 791                                         \
 792         ch = (c1);                      \
 793         if (ch >= 0x80)                 \
 794           ch = BYTE8_TO_CHAR (ch);      \
 795         CHAR_STRING_ADVANCE (ch, dst);  \
 796         ch = (c2);                      \
 797         if (ch >= 0x80)                 \
 798           ch = BYTE8_TO_CHAR (ch);      \
 799         CHAR_STRING_ADVANCE (ch, dst);  \
 800       }                                 \
 801     else                                \
 802       {                                 \
 803         *dst++ = (c1);                  \
 804         *dst++ = (c2);                  \
 805       }                                 \
 806   } while (0)
 807
 808
     /* Emit three bytes C1, C2 and C3, as EMIT_ONE_BYTE followed by
        EMIT_TWO_BYTES.  */
 809 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 810   do {                                  \
 811     EMIT_ONE_BYTE (c1);                 \
 812     EMIT_TWO_BYTES (c2, c3);            \
 813   } while (0)
 814
 815
     /* Emit four bytes C1..C4, as two uses of EMIT_TWO_BYTES.  */
 816 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 817   do {                                          \
 818     EMIT_TWO_BYTES (c1, c2);                    \
 819     EMIT_TWO_BYTES (c3, c4);                    \
 820   } while (0)
 821
 822
 823 /* Prototypes for static functions.  */
 824 static void record_conversion_result P_ ((struct coding_system *coding,
 825                                           enum coding_result_code result));
 826 static int detect_coding_utf_8 P_ ((struct coding_system *,
 827                                     struct coding_detection_info *info));
 828 static void decode_coding_utf_8 P_ ((struct coding_system *));
 829 static int encode_coding_utf_8 P_ ((struct coding_system *));
 830
 831 static int detect_coding_utf_16 P_ ((struct coding_system *,
 832                                      struct coding_detection_info *info));
 833 static void decode_coding_utf_16 P_ ((struct coding_system *));
 834 static int encode_coding_utf_16 P_ ((struct coding_system *));
 835
 836 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 837                                        struct coding_detection_info *info));
 838 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 839 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 840
 841 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 842                                          struct coding_detection_info *info));
 843 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 844 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 845
 846 static int detect_coding_sjis P_ ((struct coding_system *,
 847                                    struct coding_detection_info *info));
 848 static void decode_coding_sjis P_ ((struct coding_system *));
 849 static int encode_coding_sjis P_ ((struct coding_system *));
 850
 851 static int detect_coding_big5 P_ ((struct coding_system *,
 852                                    struct coding_detection_info *info));
 853 static void decode_coding_big5 P_ ((struct coding_system *));
 854 static int encode_coding_big5 P_ ((struct coding_system *));
 855
 856 static int detect_coding_ccl P_ ((struct coding_system *,
 857                                   struct coding_detection_info *info));
 858 static void decode_coding_ccl P_ ((struct coding_system *));
 859 static int encode_coding_ccl P_ ((struct coding_system *));
 860
 861 static void decode_coding_raw_text P_ ((struct coding_system *));
 862 static int encode_coding_raw_text P_ ((struct coding_system *));
 863
 864 static void coding_set_source P_ ((struct coding_system *));
 865 static void coding_set_destination P_ ((struct coding_system *));
 866 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 867 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 868                                             EMACS_INT));
 869 static unsigned char *alloc_destination P_ ((struct coding_system *,
 870                                              EMACS_INT, unsigned char *));
 871 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 872 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 873                                                      int *, int *,
 874                                                      unsigned char *));
 875 static int detect_eol P_ ((const unsigned char *,
 876                            EMACS_INT, enum coding_category));
 877 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 878 static void decode_eol P_ ((struct coding_system *));
 879 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 880 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 881                                         int, int *, int *));
 882 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 883 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 884                                             EMACS_INT));
 885 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 886                                         EMACS_INT));
 887 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 888 static int decode_coding P_ ((struct coding_system *));
 889 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 890                                                       struct coding_system *,
 891                                                       int *, EMACS_INT *));
 892 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 893                                                   struct coding_system *,
 894                                                   int *, EMACS_INT *));
 895 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 896 static int encode_coding P_ ((struct coding_system *));
 897 static Lisp_Object make_conversion_work_buffer P_ ((int));
 898 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 899 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 900 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 901
 902 static void
 903 record_conversion_result (struct coding_system *coding,
 904                           enum coding_result_code result)
 905 {
 906   coding->result = result;
 907   switch (result)
 908     {
 909     case CODING_RESULT_INSUFFICIENT_SRC:
 910       Vlast_code_conversion_error = Qinsufficient_source;
 911       break;
 912     case CODING_RESULT_INCONSISTENT_EOL:
 913       Vlast_code_conversion_error = Qinconsistent_eol;
 914       break;
 915     case CODING_RESULT_INVALID_SRC:
 916       Vlast_code_conversion_error = Qinvalid_source;
 917       break;
 918     case CODING_RESULT_INTERRUPT:
 919       Vlast_code_conversion_error = Qinterrupted;
 920       break;
 921     case CODING_RESULT_INSUFFICIENT_MEM:
 922       Vlast_code_conversion_error = Qinsufficient_memory;
 923       break;
 924     default:
 925       Vlast_code_conversion_error = intern ("Unknown error");
 926     }
 927 }
 928
 929 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 930   do {                                                                       \
 931     charset_map_loaded = 0;                                                  \
 932     c = DECODE_CHAR (charset, code);                                         \
 933     if (charset_map_loaded)                                                  \
 934       {                                                                      \
 935         const unsigned char *orig = coding->source;                          \
 936         EMACS_INT offset;                                                    \
 937                                                                              \
 938         coding_set_source (coding);                                          \
 939         offset = coding->source - orig;                                      \
 940         src += offset;                                                       \
 941         src_base += offset;                                                  \
 942         src_end += offset;                                                   \
 943       }                                                                      \
 944   } while (0)
 945
 946
 947 #define ASSURE_DESTINATION(bytes)                               \
 948   do {                                                          \
 949     if (dst + (bytes) >= dst_end)                               \
 950       {                                                         \
 951         int more_bytes = charbuf_end - charbuf + (bytes);       \
 952                                                                 \
 953         dst = alloc_destination (coding, more_bytes, dst);      \
 954         dst_end = coding->destination + coding->dst_bytes;      \
 955       }                                                         \
 956   } while (0)
 957
 958
 959
 960 static void
 961 coding_set_source (coding)
 962      struct coding_system *coding;
 963 {
 964   if (BUFFERP (coding->src_object))
 965     {
 966       struct buffer *buf = XBUFFER (coding->src_object);
 967
 968       if (coding->src_pos < 0)
 969         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 970       else
 971         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 972     }
 973   else if (STRINGP (coding->src_object))
 974     {
 975       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 976     }
 977   else
 978     /* Otherwise, the source is C string and is never relocated
 979        automatically.  Thus we don't have to update anything.  */
 980     ;
 981 }
 982
 983 static void
 984 coding_set_destination (coding)
 985      struct coding_system *coding;
 986 {
 987   if (BUFFERP (coding->dst_object))
 988     {
 989       if (coding->src_pos < 0)
 990         {
 991           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 992           coding->dst_bytes = (GAP_END_ADDR
 993                                - (coding->src_bytes - coding->consumed)
 994                                - coding->destination);
 995         }
 996       else
 997         {
 998           /* We are sure that coding->dst_pos_byte is before the gap
 999              of the buffer. */
1000           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1001                                  + coding->dst_pos_byte - 1);
1002           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1003                                - coding->destination);
1004         }
1005     }
1006   else
1007     /* Otherwise, the destination is C string and is never relocated
1008        automatically.  Thus we don't have to update anything.  */
1009     ;
1010 }
1011
1012
1013 static void
1014 coding_alloc_by_realloc (coding, bytes)
1015      struct coding_system *coding;
1016      EMACS_INT bytes;
1017 {
1018   coding->destination = (unsigned char *) xrealloc (coding->destination,
1019                                                     coding->dst_bytes + bytes);
1020   coding->dst_bytes += bytes;
1021 }
1022
1023 static void
1024 coding_alloc_by_making_gap (coding, bytes)
1025      struct coding_system *coding;
1026      EMACS_INT bytes;
1027 {
1028   if (BUFFERP (coding->dst_object)
1029       && EQ (coding->src_object, coding->dst_object))
1030     {
1031       EMACS_INT add = coding->src_bytes - coding->consumed;
1032
1033       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1034       make_gap (bytes);
1035       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1036     }
1037   else
1038     {
1039       Lisp_Object this_buffer;
1040
1041       this_buffer = Fcurrent_buffer ();
1042       set_buffer_internal (XBUFFER (coding->dst_object));
1043       make_gap (bytes);
1044       set_buffer_internal (XBUFFER (this_buffer));
1045     }
1046 }
1047
1048
1049 static unsigned char *
1050 alloc_destination (coding, nbytes, dst)
1051      struct coding_system *coding;
1052      EMACS_INT nbytes;
1053      unsigned char *dst;
1054 {
1055   EMACS_INT offset = dst - coding->destination;
1056
1057   if (BUFFERP (coding->dst_object))
1058     coding_alloc_by_making_gap (coding, nbytes);
1059   else
1060     coding_alloc_by_realloc (coding, nbytes);
1061   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1062   coding_set_destination (coding);
1063   dst = coding->destination + offset;
1064   return dst;
1065 }
1066
1067 /** Macros for annotations.  */
1068
/* Upper bound on the number of charbuf elements taken by annotation
   data, i.e. the sum of one composition annotation and one charset
   annotation.  */
#define MAX_ANNOTATION_LENGTH \
  (4 + 4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1)
1072
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */

/* Store the three-element annotation header (-LEN, MASK, NCHARS) at
   BUF, advancing BUF past it, and mark the `coding' variable of the
   enclosing scope as annotated.  BUF is evaluated three times, so it
   must be an lvalue expression without side effects.

   The expansion deliberately has no trailing semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an unbraced if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1099
/* Store the header of a composition annotation at BUF: the generic
   annotation header with length 4 and CODING_ANNOTATE_COMPOSITION_MASK,
   followed by METHOD.  BUF is advanced past the four elements.  The
   recorded length covers only these four elements; presumably the
   caller adjusts it when composition components are appended -- TODO
   confirm.  BUF and METHOD are parenthesized in the expansion so that
   arbitrary lvalue and value expressions can be passed, as with
   ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1105
1106
/* Store a complete charset annotation at BUF: the generic annotation
   header with length 4 and CODING_ANNOTATE_CHARSET_MASK, followed by
   the charset ID.  BUF is advanced past the four elements.  BUF and
   ID are parenthesized in the expansion so that arbitrary lvalue and
   value expressions can be passed, as with ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1112
1113 \f
1114 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1115
1116
1117
1118 \f
1119 /*** 3. UTF-8 ***/
1120
1121 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1122    Check if a text is encoded in UTF-8.  If it is, return 1, else
1123    return 0.  */
1124
/* Classification of a UTF-8 octet C by its high bits.  */

/* 0xxxxxxx: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)              (0x80 > (c))
/* 10xxxxxx: a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)          (0x80 == ((c) & 0xC0))
/* 110xxxxx: the leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c)      (0xC0 == ((c) & 0xE0))
/* 1110xxxx: the leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c)      (0xE0 == ((c) & 0xF0))
/* 11110xxx: the leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c)      (0xF0 == ((c) & 0xF8))
/* 111110xx: the leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c)      (0xF8 == ((c) & 0xFC))
1131
1132 static int
1133 detect_coding_utf_8 (coding, detect_info)
1134      struct coding_system *coding;
1135      struct coding_detection_info *detect_info;
1136 {
1137   const unsigned char *src = coding->source, *src_base;
1138   const unsigned char *src_end = coding->source + coding->src_bytes;
1139   int multibytep = coding->src_multibyte;
1140   int consumed_chars = 0;
1141   int found = 0;
1142
1143   detect_info->checked |= CATEGORY_MASK_UTF_8;
1144   /* A coding system of this category is always ASCII compatible.  */
1145   src += coding->head_ascii;
1146
1147   while (1)
1148     {
1149       int c, c1, c2, c3, c4;
1150
1151       src_base = src;
1152       ONE_MORE_BYTE (c);
1153       if (c < 0 || UTF_8_1_OCTET_P (c))
1154         continue;
1155       ONE_MORE_BYTE (c1);
1156       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1157         break;
1158       if (UTF_8_2_OCTET_LEADING_P (c))
1159         {
1160           found = CATEGORY_MASK_UTF_8;
1161           continue;
1162         }
1163       ONE_MORE_BYTE (c2);
1164       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1165         break;
1166       if (UTF_8_3_OCTET_LEADING_P (c))
1167         {
1168           found = CATEGORY_MASK_UTF_8;
1169           continue;
1170         }
1171       ONE_MORE_BYTE (c3);
1172       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1173         break;
1174       if (UTF_8_4_OCTET_LEADING_P (c))
1175         {
1176           found = CATEGORY_MASK_UTF_8;
1177           continue;
1178         }
1179       ONE_MORE_BYTE (c4);
1180       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1181         break;
1182       if (UTF_8_5_OCTET_LEADING_P (c))
1183         {
1184           found = CATEGORY_MASK_UTF_8;
1185           continue;
1186         }
1187       break;
1188     }
1189   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1190   return 0;
1191
1192  no_more_source:
1193   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1194     {
1195       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1196       return 0;
1197     }
1198   detect_info->found |= found;
1199   return 1;
1200 }
1201
1202
1203 static void
1204 decode_coding_utf_8 (coding)
1205      struct coding_system *coding;
1206 {
1207   const unsigned char *src = coding->source + coding->consumed;
1208   const unsigned char *src_end = coding->source + coding->src_bytes;
1209   const unsigned char *src_base;
1210   int *charbuf = coding->charbuf + coding->charbuf_used;
1211   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1212   int consumed_chars = 0, consumed_chars_base;
1213   int multibytep = coding->src_multibyte;
1214   Lisp_Object attr, charset_list;
1215
1216   CODING_GET_INFO (coding, attr, charset_list);
1217
1218   while (1)
1219     {
1220       int c, c1, c2, c3, c4, c5;
1221
1222       src_base = src;
1223       consumed_chars_base = consumed_chars;
1224
1225       if (charbuf >= charbuf_end)
1226         break;
1227
1228       ONE_MORE_BYTE (c1);
1229       if (c1 < 0)
1230         {
1231           c = - c1;
1232         }
1233       else if (UTF_8_1_OCTET_P(c1))
1234         {
1235           c = c1;
1236         }
1237       else
1238         {
1239           ONE_MORE_BYTE (c2);
1240           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1241             goto invalid_code;
1242           if (UTF_8_2_OCTET_LEADING_P (c1))
1243             {
1244               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1245               /* Reject overlong sequences here and below.  Encoders
1246                  producing them are incorrect, they can be misleading,
1247                  and they mess up read/write invariance.  */
1248               if (c < 128)
1249                 goto invalid_code;
1250             }
1251           else
1252             {
1253               ONE_MORE_BYTE (c3);
1254               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1255                 goto invalid_code;
1256               if (UTF_8_3_OCTET_LEADING_P (c1))
1257                 {
1258                   c = (((c1 & 0xF) << 12)
1259                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1260                   if (c < 0x800
1261                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1262                     goto invalid_code;
1263                 }
1264               else
1265                 {
1266                   ONE_MORE_BYTE (c4);
1267                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1268                     goto invalid_code;
1269                   if (UTF_8_4_OCTET_LEADING_P (c1))
1270                     {
1271                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1272                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1273                     if (c < 0x10000)
1274                       goto invalid_code;
1275                     }
1276                   else
1277                     {
1278                       ONE_MORE_BYTE (c5);
1279                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1280                         goto invalid_code;
1281                       if (UTF_8_5_OCTET_LEADING_P (c1))
1282                         {
1283                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1284                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1285                                | (c5 & 0x3F));
1286                           if ((c > MAX_CHAR) || (c < 0x200000))
1287                             goto invalid_code;
1288                         }
1289                       else
1290                         goto invalid_code;
1291                     }
1292                 }
1293             }
1294         }
1295
1296       *charbuf++ = c;
1297       continue;
1298
1299     invalid_code:
1300       src = src_base;
1301       consumed_chars = consumed_chars_base;
1302       ONE_MORE_BYTE (c);
1303       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1304       coding->errors++;
1305     }
1306
1307  no_more_source:
1308   coding->consumed_char += consumed_chars_base;
1309   coding->consumed = src_base - coding->source;
1310   coding->charbuf_used = charbuf - coding->charbuf;
1311 }
1312
1313
1314 static int
1315 encode_coding_utf_8 (coding)
1316      struct coding_system *coding;
1317 {
1318   int multibytep = coding->dst_multibyte;
1319   int *charbuf = coding->charbuf;
1320   int *charbuf_end = charbuf + coding->charbuf_used;
1321   unsigned char *dst = coding->destination + coding->produced;
1322   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1323   int produced_chars = 0;
1324   int c;
1325
1326   if (multibytep)
1327     {
1328       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1329
1330       while (charbuf < charbuf_end)
1331         {
1332           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1333
1334           ASSURE_DESTINATION (safe_room);
1335           c = *charbuf++;
1336           if (CHAR_BYTE8_P (c))
1337             {
1338               c = CHAR_TO_BYTE8 (c);
1339               EMIT_ONE_BYTE (c);
1340             }
1341           else
1342             {
1343               CHAR_STRING_ADVANCE (c, pend);
1344               for (p = str; p < pend; p++)
1345                 EMIT_ONE_BYTE (*p);
1346             }
1347         }
1348     }
1349   else
1350     {
1351       int safe_room = MAX_MULTIBYTE_LENGTH;
1352
1353       while (charbuf < charbuf_end)
1354         {
1355           ASSURE_DESTINATION (safe_room);
1356           c = *charbuf++;
1357           if (CHAR_BYTE8_P (c))
1358             *dst++ = CHAR_TO_BYTE8 (c);
1359           else
1360             dst += CHAR_STRING (c, dst);
1361           produced_chars++;
1362         }
1363     }
1364   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1365   coding->produced_char += produced_chars;
1366   coding->produced = dst - coding->destination;
1367   return 0;
1368 }
1369
1370
1371 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1372    Check if a text is encoded in one of UTF-16 based coding systems.
1373    If it is, return 1, else return 0.  */
1374
/* Nonzero if the 16-bit code unit VAL lies in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val)    (0xD800 == ((val) & 0xFC00))

/* Nonzero if the 16-bit code unit VAL lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val)     (0xDC00 == ((val) & 0xFC00))

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  ((0xFFFE == (val))                    \
   || (0xFFFF == (val))                 \
   || UTF_16_LOW_SURROGATE_P (val))
1385
1386
1387 static int
1388 detect_coding_utf_16 (coding, detect_info)
1389      struct coding_system *coding;
1390      struct coding_detection_info *detect_info;
1391 {
1392   const unsigned char *src = coding->source, *src_base = src;
1393   const unsigned char *src_end = coding->source + coding->src_bytes;
1394   int multibytep = coding->src_multibyte;
1395   int consumed_chars = 0;
1396   int c1, c2;
1397
1398   detect_info->checked |= CATEGORY_MASK_UTF_16;
1399   if (coding->mode & CODING_MODE_LAST_BLOCK
1400       && (coding->src_chars & 1))
1401     {
1402       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1403       return 0;
1404     }
1405
1406   ONE_MORE_BYTE (c1);
1407   ONE_MORE_BYTE (c2);
1408   if ((c1 == 0xFF) && (c2 == 0xFE))
1409     {
1410       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1411                              | CATEGORY_MASK_UTF_16_AUTO);
1412       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1413                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1414                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1415     }
1416   else if ((c1 == 0xFE) && (c2 == 0xFF))
1417     {
1418       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1419                              | CATEGORY_MASK_UTF_16_AUTO);
1420       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1421                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1422                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1423     }
1424   else if (c1 >= 0 && c2 >= 0)
1425     {
1426       detect_info->rejected
1427         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1428     }
1429  no_more_source:
1430   return 1;
1431 }
1432
1433 static void
1434 decode_coding_utf_16 (coding)
1435      struct coding_system *coding;
1436 {
1437   const unsigned char *src = coding->source + coding->consumed;
1438   const unsigned char *src_end = coding->source + coding->src_bytes;
1439   const unsigned char *src_base;
1440   int *charbuf = coding->charbuf + coding->charbuf_used;
1441   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1442   int consumed_chars = 0, consumed_chars_base;
1443   int multibytep = coding->src_multibyte;
1444   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1445   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1446   int surrogate = CODING_UTF_16_SURROGATE (coding);
1447   Lisp_Object attr, charset_list;
1448
1449   CODING_GET_INFO (coding, attr, charset_list);
1450
1451   if (bom == utf_16_with_bom)
1452     {
1453       int c, c1, c2;
1454
1455       src_base = src;
1456       ONE_MORE_BYTE (c1);
1457       ONE_MORE_BYTE (c2);
1458       c = (c1 << 8) | c2;
1459
1460       if (endian == utf_16_big_endian
1461           ? c != 0xFEFF : c != 0xFFFE)
1462         {
1463           /* The first two bytes are not BOM.  Treat them as bytes
1464              for a normal character.  */
1465           src = src_base;
1466           coding->errors++;
1467         }
1468       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1469     }
1470   else if (bom == utf_16_detect_bom)
1471     {
1472       /* We have already tried to detect BOM and failed in
1473          detect_coding.  */
1474       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1475     }
1476
1477   while (1)
1478     {
1479       int c, c1, c2;
1480
1481       src_base = src;
1482       consumed_chars_base = consumed_chars;
1483
1484       if (charbuf + 2 >= charbuf_end)
1485         break;
1486
1487       ONE_MORE_BYTE (c1);
1488       if (c1 < 0)
1489         {
1490           *charbuf++ = -c1;
1491           continue;
1492         }
1493       ONE_MORE_BYTE (c2);
1494       if (c2 < 0)
1495         {
1496           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1497           *charbuf++ = -c2;
1498           continue;
1499         }
1500       c = (endian == utf_16_big_endian
1501            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1502       if (surrogate)
1503         {
1504           if (! UTF_16_LOW_SURROGATE_P (c))
1505             {
1506               if (endian == utf_16_big_endian)
1507                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1508               else
1509                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1510               *charbuf++ = c1;
1511               *charbuf++ = c2;
1512               coding->errors++;
1513               if (UTF_16_HIGH_SURROGATE_P (c))
1514                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1515               else
1516                 *charbuf++ = c;
1517             }
1518           else
1519             {
1520               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1521               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1522               *charbuf++ = 0x10000 + c;
1523             }
1524         }
1525       else
1526         {
1527           if (UTF_16_HIGH_SURROGATE_P (c))
1528             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1529           else
1530             *charbuf++ = c;
1531         }
1532     }
1533
1534  no_more_source:
1535   coding->consumed_char += consumed_chars_base;
1536   coding->consumed = src_base - coding->source;
1537   coding->charbuf_used = charbuf - coding->charbuf;
1538 }
1539
1540 static int
1541 encode_coding_utf_16 (coding)
1542      struct coding_system *coding;
1543 {
1544   int multibytep = coding->dst_multibyte;
1545   int *charbuf = coding->charbuf;
1546   int *charbuf_end = charbuf + coding->charbuf_used;
1547   unsigned char *dst = coding->destination + coding->produced;
1548   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1549   int safe_room = 8;
1550   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1551   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1552   int produced_chars = 0;
1553   Lisp_Object attrs, charset_list;
1554   int c;
1555
1556   CODING_GET_INFO (coding, attrs, charset_list);
1557
1558   if (bom != utf_16_without_bom)
1559     {
1560       ASSURE_DESTINATION (safe_room);
1561       if (big_endian)
1562         EMIT_TWO_BYTES (0xFE, 0xFF);
1563       else
1564         EMIT_TWO_BYTES (0xFF, 0xFE);
1565       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1566     }
1567
1568   while (charbuf < charbuf_end)
1569     {
1570       ASSURE_DESTINATION (safe_room);
1571       c = *charbuf++;
1572       if (c >= MAX_UNICODE_CHAR)
1573         c = coding->default_char;
1574
1575       if (c < 0x10000)
1576         {
1577           if (big_endian)
1578             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1579           else
1580             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1581         }
1582       else
1583         {
1584           int c1, c2;
1585
1586           c -= 0x10000;
1587           c1 = (c >> 10) + 0xD800;
1588           c2 = (c & 0x3FF) + 0xDC00;
1589           if (big_endian)
1590             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1591           else
1592             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1593         }
1594     }
1595   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1596   coding->produced = dst - coding->destination;
1597   coding->produced_char += produced_chars;
1598   return 0;
1599 }
1600
1601 \f
1602 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1603
1604 /* Emacs' internal format for representation of multiple character
1605    sets is a kind of multi-byte encoding, i.e. characters are
1606    represented by variable-length sequences of one-byte codes.
1607
1608    ASCII characters and control characters (e.g. `tab', `newline') are
1609    represented by one-byte sequences which are their ASCII codes, in
1610    the range 0x00 through 0x7F.
1611
1612    8-bit characters of the range 0x80..0x9F are represented by
1613    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1614    code + 0x20).
1615
1616    8-bit characters of the range 0xA0..0xFF are represented by
1617    one-byte sequences which are their 8-bit code.
1618
1619    The other characters are represented by a sequence of `base
1620    leading-code', optional `extended leading-code', and one or two
1621    `position-code's.  The length of the sequence is determined by the
1622    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1623    whereas extended leading-code and position-code take the range 0xA0
1624    through 0xFF.  See `charset.h' for more details about leading-code
1625    and position-code.
1626
1627    --- CODE RANGE of Emacs' internal format ---
1628    character set        range
1629    -------------        -----
1630    ascii                0x00..0x7F
1631    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1632    eight-bit-graphic    0xA0..0xFF
1633    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1634    ---------------------------------------------
1635
1636    As this is the internal character representation, the format is
1637    usually not used externally (i.e. in a file or in a data sent to a
1638    process).  But, it is possible to have a text externally in this
1639    format (i.e. by encoding by the coding system `emacs-mule').
1640
1641    In that case, a sequence of one-byte codes has a slightly different
1642    form.
1643
1644    At first, all characters in eight-bit-control are represented by
1645    one-byte sequences which are their 8-bit code.
1646
1647    Next, character composition data are represented by the byte
1648    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1649    where,
1650         METHOD is 0xF2 plus one of composition method (enum
1651         composition_method),
1652
1653         BYTES is 0xA0 plus a byte length of this composition data,
1654
1655         CHARS is 0xA0 plus a number of characters composed by this
1656         data,
1657
1658         COMPONENTs are characters of multibyte form or composition
1659         rules encoded by two-byte of ASCII codes.
1660
1661    In addition, for backward compatibility, the following formats are
1662    also recognized as composition data on decoding.
1663
1664    0x80 MSEQ ...
1665    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1666
1667    Here,
1668         MSEQ is a multibyte form but in these special format:
1669           ASCII: 0xA0 ASCII_CODE+0x80,
1670           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1671         RULE is a one byte code of the range 0xA0..0xF0 that
1672         represents a composition rule.
1673   */
1674
/* Table indexed by the first byte of an emacs-mule sequence.
   emacs_mule_char switches on the entry and accepts only the values
   1 through 4 (anything else aborts); detect_coding_emacs_mule uses
   the entry minus one as the number of bytes that must follow.  */
1675 char emacs_mule_bytes[256];
1676
/* Decode the single emacs-mule character that starts at SRC.

   On success, return the character and store the number of bytes
   consumed in *NBYTES, the value of the local counter consumed_chars
   in *NCHARS and, when ID is non-NULL, the id of the character's
   charset in *ID.  Return -1 (label invalid_code) if the bytes do not
   form a valid sequence, or -2 (label no_more_source); presumably
   ONE_MORE_BYTE, which is defined elsewhere, jumps to no_more_source
   when the source is exhausted -- confirm against its definition.

   The first byte selects the layout through emacs_mule_bytes[]:
     1 -- the byte itself is the code (ASCII or eight-bit charset);
     2 -- a leading code and one position code;
     3 -- a leading code and two position codes, or one of the private
          leading codes EMACS_MULE_LEADING_CODE_PRIVATE_11/12, an
          extended leading code and one position code;
     4 -- a first byte, a byte that indexes emacs_mule_charset[], and
          two position codes.
   Every position code must be >= 0xA0; its low seven bits are used to
   build the code handed to DECODE_CHAR.  Any other table value
   aborts.  */
1677 int
1678 emacs_mule_char (coding, src, nbytes, nchars, id)
1679      struct coding_system *coding;
1680      const unsigned char *src;
1681      int *nbytes, *nchars, *id;
1682 {
  /* NOTE(review): src_end, multibytep and consumed_chars are not
     referenced by name below except through ONE_MORE_BYTE --
     presumably that macro uses them; confirm before renaming.  */
1683   const unsigned char *src_end = coding->source + coding->src_bytes;
1684   const unsigned char *src_base = src;
1685   int multibytep = coding->src_multibyte;
1686   struct charset *charset;
1687   unsigned code;
1688   int c;
1689   int consumed_chars = 0;
1690
1691   ONE_MORE_BYTE (c);
  /* A negative value is negated and returned as is, with the charset
     taken from emacs_mule_charset[0]; DECODE_CHAR is not applied.  */
1692   if (c < 0)
1693     {
1694       c = -c;
1695       charset = emacs_mule_charset[0];
1696     }
1697   else
1698     {
1699       switch (emacs_mule_bytes[c])
1700         {
1701         case 2:
1702           if (! (charset = emacs_mule_charset[c]))
1703             goto invalid_code;
1704           ONE_MORE_BYTE (c);
1705           if (c < 0xA0)
1706             goto invalid_code;
1707           code = c & 0x7F;
1708           break;
1709
1710         case 3:
1711           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1712               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1713             {
              /* Private charset: the next byte, not the first one,
                 selects the charset.  */
1714               ONE_MORE_BYTE (c);
1715               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1716                 goto invalid_code;
1717               ONE_MORE_BYTE (c);
1718               if (c < 0xA0)
1719                 goto invalid_code;
1720               code = c & 0x7F;
1721             }
1722           else
1723             {
1724               if (! (charset = emacs_mule_charset[c]))
1725                 goto invalid_code;
1726               ONE_MORE_BYTE (c);
1727               if (c < 0xA0)
1728                 goto invalid_code;
1729               code = (c & 0x7F) << 8;
1730               ONE_MORE_BYTE (c);
1731               if (c < 0xA0)
1732                 goto invalid_code;
1733               code |= c & 0x7F;
1734             }
1735           break;
1736
1737         case 4:
          /* The first byte is not used for the charset lookup here;
             the byte that follows it is.  */
1738           ONE_MORE_BYTE (c);
1739           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1740             goto invalid_code;
1741           ONE_MORE_BYTE (c);
1742           if (c < 0xA0)
1743             goto invalid_code;
1744           code = (c & 0x7F) << 8;
1745           ONE_MORE_BYTE (c);
1746           if (c < 0xA0)
1747             goto invalid_code;
1748           code |= c & 0x7F;
1749           break;
1750
1751         case 1:
1752           code = c;
1753           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1754                                      ? charset_ascii : charset_eight_bit);
1755           break;
1756
1757         default:
1758           abort ();
1759         }
1760       c = DECODE_CHAR (charset, code);
1761       if (c < 0)
1762         goto invalid_code;
1763     }
1764   *nbytes = src - src_base;
1765   *nchars = consumed_chars;
1766   if (id)
1767     *id = charset->id;
1768   return c;
1769
1770  no_more_source:
1771   return -2;
1772
1773  invalid_code:
1774   return -1;
1775 }
1776
1777
1778 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1779    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1780    else return 0.  */
1781
1782 static int
1783 detect_coding_emacs_mule (coding, detect_info)
1784      struct coding_system *coding;
1785      struct coding_detection_info *detect_info;
1786 {
1787   const unsigned char *src = coding->source, *src_base;
1788   const unsigned char *src_end = coding->source + coding->src_bytes;
1789   int multibytep = coding->src_multibyte;
1790   int consumed_chars = 0;
1791   int c;
1792   int found = 0;
1793
1794   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1795   /* A coding system of this category is always ASCII compatible.  */
1796   src += coding->head_ascii;
1797
1798   while (1)
1799     {
1800       src_base = src;
1801       ONE_MORE_BYTE (c);
1802       if (c < 0)
1803         continue;
1804       if (c == 0x80)
1805         {
1806           /* Perhaps the start of composite character.  We simple skip
1807              it because analyzing it is too heavy for detecting.  But,
1808              at least, we check that the composite character
1809              constitues of more than 4 bytes.  */
1810           const unsigned char *src_base;
1811
1812         repeat:
1813           src_base = src;
1814           do
1815             {
1816               ONE_MORE_BYTE (c);
1817             }
1818           while (c >= 0xA0);
1819
1820           if (src - src_base <= 4)
1821             break;
1822           found = CATEGORY_MASK_EMACS_MULE;
1823           if (c == 0x80)
1824             goto repeat;
1825         }
1826
1827       if (c < 0x80)
1828         {
1829           if (c < 0x20
1830               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1831             break;
1832         }
1833       else
1834         {
1835           int more_bytes = emacs_mule_bytes[*src_base] - 1;
1836
1837           while (more_bytes > 0)
1838             {
1839               ONE_MORE_BYTE (c);
1840               if (c < 0xA0)
1841                 {
1842                   src--;        /* Unread the last byte.  */
1843                   break;
1844                 }
1845               more_bytes--;
1846             }
1847           if (more_bytes != 0)
1848             break;
1849           found = CATEGORY_MASK_EMACS_MULE;
1850         }
1851     }
1852   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1853   return 0;
1854
1855  no_more_source:
1856   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1857     {
1858       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1859       return 0;
1860     }
1861   detect_info->found |= found;
1862   return 1;
1863 }
1864
1865
1866 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1867
1868 /* Decode one character of a composition sequence at SRC by calling
1869    emacs_mule_char.  On success store the character in *BUF, increment
1870    BUF, advance SRC by the bytes used and add the characters used to
1871    consumed_chars.  If SRC is at SRC_END or emacs_mule_char returns -2,
1872    `break' out of the enclosing loop; on any other negative result jump
1873    to invalid_code.  The expansion ends in `else': follow it with `;'.  */
1874
1875 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1876   if (1)                                                        \
1877     {                                                           \
1878       int c;                                                    \
1879       int nbytes, nchars;                                       \
1880                                                                 \
1881       if (src == src_end)                                       \
1882         break;                                                  \
1883       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1884       if (c < 0)                                                \
1885         {                                                       \
1886           if (c == -2)                                          \
1887             break;                                              \
1888           goto invalid_code;                                    \
1889         }                                                       \
1890       *buf++ = c;                                               \
1891       src += nbytes;                                            \
1892       consumed_chars += nchars;                                 \
1893     }                                                           \
1894   else
1895
1896
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 20 style at SRC.  The rule is a single byte in
   the range 0xA0..0xF0, i.e. 0xA0 plus (GREF * 9 + NREF).  Store the
   decoded rule in *BUF and increment BUF.  If no byte is left at SRC
   or the byte is outside that range, jump to the label invalid_code.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
  do {                                                  \
    int c, gref, nref;                                  \
                                                        \
    if (src >= src_end)                                 \
      goto invalid_code;                                \
    ONE_MORE_BYTE_NO_CHECK (c);                         \
    /* Rule bytes are biased by 0xA0, not 0x20.  */     \
    c -= 0xA0;                                          \
    if (c < 0 || c >= 81)                               \
      goto invalid_code;                                \
                                                        \
    gref = c / 9, nref = c % 9;                         \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
1916
1917
1918 /* Decode a composition rule represented as a component of composition
1919    sequence of Emacs 21 style at SRC: two bytes, 0x20 plus GREF and 0x20
1920    plus NREF.  Store the decoded rule in *BUF and increment BUF.  If fewer
1921    than two bytes remain or a value is outside 0..80, jump to invalid_code.  */
1922
1923 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1924   do {                                                  \
1925     int gref, nref;                                     \
1926                                                         \
1927     if (src + 1>= src_end)                              \
1928       goto invalid_code;                                \
1929     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1930     gref -= 0x20;                                       \
1931     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1932     nref -= 0x20;                                       \
1933     if (gref < 0 || gref >= 81                          \
1934         || nref < 0 || nref >= 81)                      \
1935       goto invalid_code;                                \
1936     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1937   } while (0)
1938
1939
/* Decode the header and components of an Emacs 21 style composition.
   On entry C holds the method byte (0xF2 plus an enum
   composition_method value); the macro then reads a byte that is
   0xA0 plus BYTES and a byte that is 0xA0 plus CHARS, and calls
   ADD_COMPOSITION_DATA (charbuf, CHARS, method).  Unless the method is
   COMPOSITION_RELATIVE, components are decoded into charbuf until
   consumed_chars reaches consumed_chars_base + BYTES: characters, with
   a composition rule at every odd position except for
   COMPOSITION_WITH_ALTCHARS.  The number of decoded components is then
   subtracted from the element at the position charbuf had on entry.
   A negative length byte, BYTES below 3, or components that stop
   short of the limit jump to invalid_code.
   NOTE(review): CHARS is not range-checked here, and the component
   loop has no check against charbuf_end -- confirm that callers
   leave enough room.  */
1940 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1941   do {                                                                  \
1942     /* Emacs 21 style format.  The first three bytes at SRC are         \
1943        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1944        the byte length of this composition information, CHARS is the    \
1945        number of characters composed by this composition.  */           \
1946     enum composition_method method = c - 0xF2;                          \
1947     int *charbuf_base = charbuf;                                        \
1948     int consumed_chars_limit;                                           \
1949     int nbytes, nchars;                                                 \
1950                                                                         \
1951     ONE_MORE_BYTE (c);                                                  \
1952     if (c < 0)                                                          \
1953       goto invalid_code;                                                \
1954     nbytes = c - 0xA0;                                                  \
1955     if (nbytes < 3)                                                     \
1956       goto invalid_code;                                                \
1957     ONE_MORE_BYTE (c);                                                  \
1958     if (c < 0)                                                          \
1959       goto invalid_code;                                                \
1960     nchars = c - 0xA0;                                                  \
1961     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1962     consumed_chars_limit = consumed_chars_base + nbytes;                \
1963     if (method != COMPOSITION_RELATIVE)                                 \
1964       {                                                                 \
1965         int i = 0;                                                      \
1966         while (consumed_chars < consumed_chars_limit)                   \
1967           {                                                             \
1968             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1969               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1970             else                                                        \
1971               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1972             i++;                                                        \
1973           }                                                             \
1974         if (consumed_chars < consumed_chars_limit)                      \
1975           goto invalid_code;                                            \
1976         charbuf_base[0] -= i;                                           \
1977       }                                                                 \
1978   } while (0)
1979
1980
/* Decode an Emacs 20 style relative composition, "0x80 MSEQ ...".
   The caller has already read the 0x80 and one more byte, so the
   macro rewinds to SRC_BASE, skips the 0x80 again and decodes up to
   MAX_COMPOSITION_COMPONENTS characters.  Fewer than two characters
   jump to invalid_code.  Otherwise ADD_COMPOSITION_DATA is called
   with the character count and the characters are appended to
   charbuf.  C is used as scratch storage for the skipped byte.  */
#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
  do {                                                          \
    /* Emacs 20 style format for relative composition.  */      \
    /* Store multibyte form of characters to be composed.  */   \
    enum composition_method method = COMPOSITION_RELATIVE;      \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
    int *buf = components;                                      \
    int i, j;                                                   \
                                                                \
    /* Rewind the byte pointer and the character counter     */ \
    /* together; rewinding only SRC would count the bytes    */ \
    /* the caller already read a second time.                */ \
    src = src_base;                                             \
    consumed_chars = consumed_chars_base;                       \
    ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
      DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
    if (i < 2)                                                  \
      goto invalid_code;                                        \
    ADD_COMPOSITION_DATA (charbuf, i, method);                  \
    for (j = 0; j < i; j++)                                     \
      *charbuf++ = components[j];                               \
  } while (0)
2000
2001
/* Decode an Emacs 20 style rule-base composition,
   "0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ", whose first two bytes the
   caller has already read.  Characters and rules are collected in a
   local array; the sequence ends at the first byte below 0xA0 (which
   cannot be a rule), at the end of the source, or after
   MAX_COMPOSITION_COMPONENTS characters.  A sequence with fewer than
   two characters, or one that ends right after a rule, jumps to
   invalid_code.  If charbuf lacks room, jump to no_more_source.
   Otherwise ADD_COMPOSITION_DATA is called with the character count,
   all components are appended as part of that annotation (its first
   element, a negative length, is extended accordingly), and finally
   the characters alone are appended as text.
   NOTE: if the very first DECODE_EMACS_MULE_COMPOSITION_CHAR hits its
   `break', control leaves this whole macro without storing anything.
   C is unused.  */
#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
  do {                                                          \
    /* Emacs 20 style format for rule-base composition.  */     \
    /* Store multibyte form of characters to be composed.  */   \
    enum composition_method method = COMPOSITION_WITH_RULE;     \
    int *charbuf_base = charbuf;                                \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
    int *buf = components;                                      \
    int i, j;                                                   \
                                                                \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
    /* I counts characters.  Starting at 1 keeps the total   */ \
    /* of characters and rules within components[].          */ \
    for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
      {                                                         \
        if (src == src_end || *src < 0xA0)                      \
          break;                                                \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
      }                                                         \
    if (i < 2 || (buf - components) % 2 == 0)                   \
      goto invalid_code;                                        \
    /* From here on I is the number of components.  */          \
    i = buf - components;                                       \
    if (charbuf + i + (i / 2) + 1 > charbuf_end)                \
      goto no_more_source;                                      \
    ADD_COMPOSITION_DATA (charbuf, (i + 1) / 2, method);        \
    for (j = 0; j < i; j++)                                     \
      *charbuf++ = components[j];                               \
    charbuf_base[0] -= i;                                       \
    for (j = 0; j < i; j += 2)                                  \
      *charbuf++ = components[j];                               \
  } while (0)
2027
2028
2029 static void
2030 decode_coding_emacs_mule (coding)
2031      struct coding_system *coding;
2032 {
2033   const unsigned char *src = coding->source + coding->consumed;
2034   const unsigned char *src_end = coding->source + coding->src_bytes;
2035   const unsigned char *src_base;
2036   int *charbuf = coding->charbuf + coding->charbuf_used;
2037   int *charbuf_end
2038     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2039   int consumed_chars = 0, consumed_chars_base;
2040   int multibytep = coding->src_multibyte;
2041   Lisp_Object attrs, charset_list;
2042   int char_offset = coding->produced_char;
2043   int last_offset = char_offset;
2044   int last_id = charset_ascii;
2045
2046   CODING_GET_INFO (coding, attrs, charset_list);
2047
2048   while (1)
2049     {
2050       int c;
2051
2052       src_base = src;
2053       consumed_chars_base = consumed_chars;
2054
2055       if (charbuf >= charbuf_end)
2056         break;
2057
2058       ONE_MORE_BYTE (c);
2059       if (c < 0)
2060         {
2061           *charbuf++ = -c;
2062           char_offset++;
2063         }
2064       else if (c < 0x80)
2065         {
2066           *charbuf++ = c;
2067           char_offset++;
2068         }
2069       else if (c == 0x80)
2070         {
2071           ONE_MORE_BYTE (c);
2072           if (c < 0)
2073             goto invalid_code;
2074           if (c - 0xF2 >= COMPOSITION_RELATIVE
2075               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2076             DECODE_EMACS_MULE_21_COMPOSITION (c);
2077           else if (c < 0xC0)
2078             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2079           else if (c == 0xFF)
2080             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2081           else
2082             goto invalid_code;
2083         }
2084       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2085         {
2086           int nbytes, nchars;
2087           int id;
2088
2089           src = src_base;
2090           consumed_chars = consumed_chars_base;
2091           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2092           if (c < 0)
2093             {
2094               if (c == -2)
2095                 break;
2096               goto invalid_code;
2097             }
2098           if (last_id != id)
2099             {
2100               if (last_id != charset_ascii)
2101                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2102               last_id = id;
2103               last_offset = char_offset;
2104             }
2105           *charbuf++ = c;
2106           src += nbytes;
2107           consumed_chars += nchars;
2108           char_offset++;
2109         }
2110       continue;
2111
2112     invalid_code:
2113       src = src_base;
2114       consumed_chars = consumed_chars_base;
2115       ONE_MORE_BYTE (c);
2116       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2117       char_offset++;
2118       coding->errors++;
2119     }
2120
2121  no_more_source:
2122   if (last_id != charset_ascii)
2123     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2124   coding->consumed_char += consumed_chars_base;
2125   coding->consumed = src_base - coding->source;
2126   coding->charbuf_used = charbuf - coding->charbuf;
2127 }
2128
2129
/* Store in CODES[0] and CODES[1] the leading code(s) for the
   emacs-mule charset id ID.  An id below 0xA0 is its own leading code
   and CODES[1] is set to 0.  Otherwise CODES[0] is 0x9A, 0x9B, 0x9C or
   0x9D, chosen by the range ID falls in, and CODES[1] is ID itself.
   ID is evaluated more than once, so it must have no side effects.
   The expansion is a single statement without a trailing semicolon,
   so it can be used safely in an if/else.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2143
2144
2145 static int
2146 encode_coding_emacs_mule (coding)
2147      struct coding_system *coding;
2148 {
2149   int multibytep = coding->dst_multibyte;
2150   int *charbuf = coding->charbuf;
2151   int *charbuf_end = charbuf + coding->charbuf_used;
2152   unsigned char *dst = coding->destination + coding->produced;
2153   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2154   int safe_room = 8;
2155   int produced_chars = 0;
2156   Lisp_Object attrs, charset_list;
2157   int c;
2158   int preferred_charset_id = -1;
2159
2160   CODING_GET_INFO (coding, attrs, charset_list);
2161   if (! EQ (charset_list, Vemacs_mule_charset_list))
2162     {
2163       CODING_ATTR_CHARSET_LIST (attrs)
2164         = charset_list = Vemacs_mule_charset_list;
2165     }
2166
2167   while (charbuf < charbuf_end)
2168     {
2169       ASSURE_DESTINATION (safe_room);
2170       c = *charbuf++;
2171
2172       if (c < 0)
2173         {
2174           /* Handle an annotation.  */
2175           switch (*charbuf)
2176             {
2177             case CODING_ANNOTATE_COMPOSITION_MASK:
2178               /* Not yet implemented.  */
2179               break;
2180             case CODING_ANNOTATE_CHARSET_MASK:
2181               preferred_charset_id = charbuf[3];
2182               if (preferred_charset_id >= 0
2183                   && NILP (Fmemq (make_number (preferred_charset_id),
2184                                   charset_list)))
2185                 preferred_charset_id = -1;
2186               break;
2187             default:
2188               abort ();
2189             }
2190           charbuf += -c - 1;
2191           continue;
2192         }
2193
2194       if (ASCII_CHAR_P (c))
2195         EMIT_ONE_ASCII_BYTE (c);
2196       else if (CHAR_BYTE8_P (c))
2197         {
2198           c = CHAR_TO_BYTE8 (c);
2199           EMIT_ONE_BYTE (c);
2200         }
2201       else
2202         {
2203           struct charset *charset;
2204           unsigned code;
2205           int dimension;
2206           int emacs_mule_id;
2207           unsigned char leading_codes[2];
2208
2209           if (preferred_charset_id >= 0)
2210             {
2211               charset = CHARSET_FROM_ID (preferred_charset_id);
2212               if (! CHAR_CHARSET_P (c, charset))
2213                 charset = char_charset (c, charset_list, NULL);
2214             }
2215           else
2216             charset = char_charset (c, charset_list, &code);
2217           if (! charset)
2218             {
2219               c = coding->default_char;
2220               if (ASCII_CHAR_P (c))
2221                 {
2222                   EMIT_ONE_ASCII_BYTE (c);
2223                   continue;
2224                 }
2225               charset = char_charset (c, charset_list, &code);
2226             }
2227           dimension = CHARSET_DIMENSION (charset);
2228           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2229           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2230           EMIT_ONE_BYTE (leading_codes[0]);
2231           if (leading_codes[1])
2232             EMIT_ONE_BYTE (leading_codes[1]);
2233           if (dimension == 1)
2234             EMIT_ONE_BYTE (code | 0x80);
2235           else
2236             {
2237               code |= 0x8080;
2238               EMIT_ONE_BYTE (code >> 8);
2239               EMIT_ONE_BYTE (code & 0xFF);
2240             }
2241         }
2242     }
2243   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2244   coding->produced_char += produced_chars;
2245   coding->produced = dst - coding->destination;
2246   return 0;
2247 }
2248
2249 \f
2250 /*** 7. ISO2022 handlers ***/
2251
2252 /* The following note describes the coding system ISO2022 briefly.
2253    Since the intention of this note is to help understand the
2254    functions in this file, some parts are NOT ACCURATE or are OVERLY
2255    SIMPLIFIED.  For thorough understanding, please refer to the
2256    original document of ISO2022.  This is equivalent to the standard
2257    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2258
2259    ISO2022 provides many mechanisms to encode several character sets
2260    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2261    is encoded using bytes less than 128.  This may make the encoded
2262    text a little bit longer, but the text passes more easily through
2263    several types of gateway, some of which strip off the MSB (Most
2264    Significant Bit).
2265
2266    There are two kinds of character sets: control character sets and
2267    graphic character sets.  The former contain control characters such
2268    as `newline' and `escape' to provide control functions (control
2269    functions are also provided by escape sequences).  The latter
2270    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2271    two control character sets and many graphic character sets.
2272
2273    Graphic character sets are classified into one of the following
2274    four classes, according to the number of bytes (DIMENSION) and
2275    number of characters in one dimension (CHARS) of the set:
2276    - DIMENSION1_CHARS94
2277    - DIMENSION1_CHARS96
2278    - DIMENSION2_CHARS94
2279    - DIMENSION2_CHARS96
2280
2281    In addition, each character set is assigned an identification tag,
2282    unique for each set, called the "final character" (denoted as <F>
2283    hereafter).  The <F> of each character set is decided by ECMA(*)
2284    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2285    (0x30..0x3F are for private use only).
2286
2287    Note (*): ECMA = European Computer Manufacturers Association
2288
2289    Here are examples of graphic character sets [NAME(<F>)]:
2290         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2291         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2292         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2293         o DIMENSION2_CHARS96 -- none for the moment
2294
2295    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2296         C0 [0x00..0x1F] -- control character plane 0
2297         GL [0x20..0x7F] -- graphic character plane 0
2298         C1 [0x80..0x9F] -- control character plane 1
2299         GR [0xA0..0xFF] -- graphic character plane 1
2300
2301    A control character set is directly designated and invoked to C0 or
2302    C1 by an escape sequence.  The most common case is that:
2303    - ISO646's  control character set is designated/invoked to C0, and
2304    - ISO6429's control character set is designated/invoked to C1,
2305    and usually these designations/invocations are omitted in encoded
2306    text.  In a 7-bit environment, only C0 can be used, and a control
2307    character for C1 is encoded by an appropriate escape sequence to
2308    fit into the environment.  All control characters for C1 are
2309    defined to have corresponding escape sequences.
2310
2311    A graphic character set is at first designated to one of four
2312    graphic registers (G0 through G3), then these graphic registers are
2313    invoked to GL or GR.  These designations and invocations can be
2314    done independently.  The most common case is that G0 is invoked to
2315    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2316    these invocations and designations are omitted in encoded text.
2317    In a 7-bit environment, only GL can be used.
2318
2319    When a graphic character set of CHARS94 is invoked to GL, codes
2320    0x20 and 0x7F of the GL area work as control characters SPACE and
2321    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2322    be used.
2323
2324    There are two ways of invocation: locking-shift and single-shift.
2325    With locking-shift, the invocation lasts until the next different
2326    invocation, whereas with single-shift, the invocation affects the
2327    following character only and doesn't affect the locking-shift
2328    state.  Invocations are done by the following control characters or
2329    escape sequences:
2330
2331    ----------------------------------------------------------------------
2332    abbrev  function                  cntrl escape seq   description
2333    ----------------------------------------------------------------------
2334    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2335    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2336    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2337    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2338    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2339    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2340    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2341    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2342    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2343    ----------------------------------------------------------------------
2344    (*) These are not used by any known coding system.
2345
2346    Control characters for these functions are defined by macros
2347    ISO_CODE_XXX in `coding.h'.
2348
2349    Designations are done by the following escape sequences:
2350    ----------------------------------------------------------------------
2351    escape sequence      description
2352    ----------------------------------------------------------------------
2353    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2354    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2355    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2356    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2357    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2358    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2359    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2360    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2361    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2362    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2363    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2364    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2365    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2366    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2367    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2368    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2369    ----------------------------------------------------------------------
2370
2371    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2372    of dimension 1, chars 94, and final character <F>, etc...
2373
2374    Note (*): Although these designations are not allowed in ISO2022,
2375    Emacs accepts them on decoding, and produces them on encoding
2376    CHARS96 character sets in a coding system which is characterized as
2377    7-bit environment, non-locking-shift, and non-single-shift.
2378
2379    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2380    '(' must be omitted.  We refer to this as "short-form" hereafter.
2381
2382    Now you may notice that there are a lot of ways of encoding the
2383    same multilingual text in ISO2022.  Actually, there exist many
2384    coding systems such as Compound Text (used in X11's inter-client
2385    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2386    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2387    localized platforms), and all of these are variants of ISO2022.
2388
2389    In addition to the above, Emacs handles two more kinds of escape
2390    sequences: ISO6429's direction specification and Emacs' private
2391    sequence for specifying character composition.
2392
2393    ISO6429's direction specification takes the following form:
2394         o CSI ']'      -- end of the current direction
2395         o CSI '0' ']'  -- end of the current direction
2396         o CSI '1' ']'  -- start of left-to-right text
2397         o CSI '2' ']'  -- start of right-to-left text
2398    The control character CSI (0x9B: control sequence introducer) is
2399    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2400
2401    Character composition specification takes the following form:
2402         o ESC '0' -- start relative composition
2403         o ESC '1' -- end composition
2404         o ESC '2' -- start rule-base composition (*)
2405         o ESC '3' -- start relative composition with alternate chars  (**)
2406         o ESC '4' -- start rule-base composition with alternate chars  (**)
2407   Since these are not standard escape sequences of any ISO standard,
2408   the use of them with these meanings is restricted to Emacs only.
2409
2410   (*) This form is used only in Emacs 20.7 and older versions,
2411   but newer versions can safely decode it.
2412   (**) This form is used only in Emacs 21.1 and newer versions,
2413   and older versions can't decode it.
2414
2415   Here's a list of example usages of these composition escape
2416   sequences (categorized by `enum composition_method').
2417
2418   COMPOSITION_RELATIVE:
2419         ESC 0 CHAR [ CHAR ] ESC 1
2420   COMPOSITION_WITH_RULE:
2421         ESC 2 CHAR [ RULE CHAR ] ESC 1
2422   COMPOSITION_WITH_ALTCHARS:
2423         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2424   COMPOSITION_WITH_RULE_ALTCHARS:
2425         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2426
/* Table giving the ISO 2022 code class of each of the 256 byte values.
   decode_coding_iso_2022 dispatches on iso_code_class[C1] for every
   byte C1 it reads.  */
2427 enum iso_code_class_type iso_code_class[256];
2428
/* Return nonzero if the charset whose ID is ID may be used with CODING.

   CODING->safe_charsets holds one byte per charset ID in the range
   0..CODING->max_charset_id.  setup_iso_safe_charsets fills every byte
   with 255 and then overwrites the bytes of the usable charsets with a
   graphic register number, so 255 means "not safe".

   The byte is compared against 255 through `unsigned char' rather than
   being tested for `>= 0': the latter recognizes the 255 marker only
   where plain `char' is signed, and reports every charset as safe
   where it is unsigned.  A negative ID (the value the charset tables
   use for "no such charset") is never safe and is never used as an
   index.  ID is evaluated more than once, so pass a plain variable.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0                                                    \
   && (id) <= (coding)->max_charset_id                          \
   && ((unsigned char) (coding)->safe_charsets[(id)]) != 255)
2432
2433
/* Nonzero if CODING_ISO_INITIAL for register 1 of the coding system
   stored in coding_categories[CATEGORY] is non-negative.
   NOTE(review): presumably this means a charset is initially
   designated to G1, so that shift-out is meaningful -- confirm against
   the definition of CODING_ISO_INITIAL.  */
2434 #define SHIFT_OUT_OK(category)  \
2435   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2436
/* Compute the safe-charsets string of the ISO-2022 coding system whose
   attribute vector is ATTRS, and store it in the
   coding_attr_safe_charsets slot of ATTRS.

   The string has one byte per charset ID, from 0 up to the largest ID
   found in the charset list.  A byte holds the graphic register chosen
   for that charset, or 255 if no register was assigned to it.

   If CODING_ISO_FLAG_FULL_SUPPORT is set and the charset list of ATTRS
   is not Viso_2022_charset_list, the list is first replaced by
   Viso_2022_charset_list and any previously stored string is
   discarded.  Otherwise, if the slot already holds a string, the
   function returns without recomputing it.  */
2437 static void
2438 setup_iso_safe_charsets (attrs)
2439      Lisp_Object attrs;
2440 {
2441   Lisp_Object charset_list, safe_charsets;
2442   Lisp_Object request;
2443   Lisp_Object reg_usage;
2444   Lisp_Object tail;
2445   int reg94, reg96;
2446   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2447   int max_charset_id;
2448
2449   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
       /* A full-support coding system always uses the global list;
          resetting the slot to nil forces the string to be rebuilt
          below.  */
2450   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2451       && ! EQ (charset_list, Viso_2022_charset_list))
2452     {
2453       CODING_ATTR_CHARSET_LIST (attrs)
2454         = charset_list = Viso_2022_charset_list;
2455       ASET (attrs, coding_attr_safe_charsets, Qnil);
2456     }
2457
       /* The string has been computed already.  */
2458   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2459     return;
2460
       /* The string is indexed by charset ID, so it must be long enough
          for the largest ID in the list.  */
2461   max_charset_id = 0;
2462   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2463     {
2464       int id = XINT (XCAR (tail));
2465       if (max_charset_id < id)
2466         max_charset_id = id;
2467     }
2468
       /* Every byte starts as 255, i.e. "no register assigned".  */
2469   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2470                                 make_number (255));
       /* REQUEST is searched below as an alist keyed by charset ID.
          REG_USAGE is a cons whose car and cdr are the default
          registers for 94-character and 96-character sets.  */
2471   request = AREF (attrs, coding_attr_iso_request);
2472   reg_usage = AREF (attrs, coding_attr_iso_usage);
2473   reg94 = XINT (XCAR (reg_usage));
2474   reg96 = XINT (XCDR (reg_usage));
2475
2476   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2477     {
2478       Lisp_Object id;
2479       Lisp_Object reg;
2480       struct charset *charset;
2481
2482       id = XCAR (tail);
2483       charset = CHARSET_FROM_ID (XINT (id));
           /* An explicit entry in REQUEST takes precedence.  Otherwise
              use the default register for the charset's size; a
              default of 4 or more leaves the byte at 255.  */
2484       reg = Fcdr (Fassq (id, request));
2485       if (! NILP (reg))
2486         SSET (safe_charsets, XINT (id), XINT (reg));
2487       else if (charset->iso_chars_96)
2488         {
2489           if (reg96 < 4)
2490             SSET (safe_charsets, XINT (id), reg96);
2491         }
2492       else
2493         {
2494           if (reg94 < 4)
2495             SSET (safe_charsets, XINT (id), reg94);
2496         }
2497     }
2498   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2499 }
2500
2501
2502 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2503    Check if a text is encoded in one of ISO-2022 based coding systems.
2504    If it is, return 1, else return 0.

        CATEGORY_MASK_ISO is added to DETECT_INFO->checked.  While the
        bytes of CODING->source are scanned, bits of the individual ISO
        categories are accumulated in the locals `found' and
        `rejected'.  As soon as every ISO category has been rejected,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise control reaches the label
        `no_more_source' (presumably ONE_MORE_BYTE jumps there at the
        end of the source; that macro is defined elsewhere), where
        `rejected' and `found & ~rejected' are merged into DETECT_INFO
        and 1 is returned.  */
2505
2506 static int
2507 detect_coding_iso_2022 (coding, detect_info)
2508      struct coding_system *coding;
2509      struct coding_detection_info *detect_info;
2510 {
2511   const unsigned char *src = coding->source, *src_base = src;
2512   const unsigned char *src_end = coding->source + coding->src_bytes;
2513   int multibytep = coding->src_multibyte;
2514   int single_shifting = 0;
2515   int id;
2516   int c, c1;
2517   int consumed_chars = 0;
2518   int i;
2519   int rejected = 0;
2520   int found = 0;
2521
2522   detect_info->checked |= CATEGORY_MASK_ISO;
2523
       /* Make max_charset_id and safe_charsets of every ISO category
          valid, because SAFE_CHARSET_P below reads them.  */
2524   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2525     {
2526       struct coding_system *this = &(coding_categories[i]);
2527       Lisp_Object attrs, val;
2528
2529       attrs = CODING_ID_ATTRS (this->id);
           /* NOTE(review): this compares the safe-charsets attribute
              with the charset list, whereas setup_iso_safe_charsets
              itself compares CODING_ATTR_CHARSET_LIST with
              Viso_2022_charset_list.  As written the EQ test looks
              always false; confirm whether CODING_ATTR_CHARSET_LIST
              was intended here.  */
2530       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2531           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2532         setup_iso_safe_charsets (attrs);
2533       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2534       this->max_charset_id = SCHARS (val) - 1;
2535       this->safe_charsets = (char *) SDATA (val);
2536     }
2537
2538   /* A coding system of this category is always ASCII compatible.  */
2539   src += coding->head_ascii;
2540
2541   while (rejected != CATEGORY_MASK_ISO)
2542     {
2543       src_base = src;
2544       ONE_MORE_BYTE (c);
2545       switch (c)
2546         {
2547         case ISO_CODE_ESC:
2548           if (inhibit_iso_escape_detection)
2549             break;
2550           single_shifting = 0;
2551           ONE_MORE_BYTE (c);
2552           if (c >= '(' && c <= '/')
2553             {
2554               /* Designation sequence for a charset of dimension 1.  */
2555               ONE_MORE_BYTE (c1);
2556               if (c1 < ' ' || c1 >= 0x80
2557                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2558                 /* Invalid designation sequence.  Just ignore.  */
2559                 break;
2560             }
2561           else if (c == '$')
2562             {
2563               /* Designation sequence for a charset of dimension 2.  */
2564               ONE_MORE_BYTE (c);
2565               if (c >= '@' && c <= 'B')
2566                 /* Designation for JISX0208.1978, GB2312, or JISX0208.
                        NOTE(review): unlike the other branches, ID is
                        not checked for a negative value here before it
                        is passed to SAFE_CHARSET_P below.  */
2567                 id = iso_charset_table[1][0][c];
2568               else if (c >= '(' && c <= '/')
2569                 {
2570                   ONE_MORE_BYTE (c1);
2571                   if (c1 < ' ' || c1 >= 0x80
2572                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2573                     /* Invalid designation sequence.  Just ignore.  */
2574                     break;
2575                 }
2576               else
2577                 /* Invalid designation sequence.  Just ignore it.  */
2578                 break;
2579             }
2580           else if (c == 'N' || c == 'O')
2581             {
2582               /* ESC <Fe> for SS2 or SS3.  */
2583               single_shifting = 1;
2584               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2585               break;
2586             }
2587           else if (c >= '0' && c <= '4')
2588             {
2589               /* ESC <Fp> for start/end composition.  */
2590               found |= CATEGORY_MASK_ISO;
2591               break;
2592             }
2593           else
2594             {
2595               /* Invalid escape sequence.  Just ignore it.  */
2596               break;
2597             }
2598
2599           /* We found a valid designation sequence for CHARSET.  The
                  8-bit categories are rejected; each of the other
                  categories is marked found or rejected according to
                  whether the designated charset is safe for it.  */
2600           rejected |= CATEGORY_MASK_ISO_8BIT;
2601           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2602                               id))
2603             found |= CATEGORY_MASK_ISO_7;
2604           else
2605             rejected |= CATEGORY_MASK_ISO_7;
2606           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2607                               id))
2608             found |= CATEGORY_MASK_ISO_7_TIGHT;
2609           else
2610             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2611           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2612                               id))
2613             found |= CATEGORY_MASK_ISO_7_ELSE;
2614           else
2615             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2616           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2617                               id))
2618             found |= CATEGORY_MASK_ISO_8_ELSE;
2619           else
2620             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2621           break;
2622
2623         case ISO_CODE_SO:
2624         case ISO_CODE_SI:
2625           /* Locking shift out/in.  */
2626           if (inhibit_iso_escape_detection)
2627             break;
2628           single_shifting = 0;
2629           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2630           found |= CATEGORY_MASK_ISO_ELSE;
2631           break;
2632
2633         case ISO_CODE_CSI:
2634           /* Control sequence introducer.  */
2635           single_shifting = 0;
2636           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2637           found |= CATEGORY_MASK_ISO_8_ELSE;
2638           goto check_extra_latin;
2639
2640         case ISO_CODE_SS2:
2641         case ISO_CODE_SS3:
2642           /* Single shift.   */
2643           if (inhibit_iso_escape_detection)
2644             break;
2645           single_shifting = 0;
2646           rejected |= CATEGORY_MASK_ISO_7BIT;
               /* The byte counts as a single shift only if one of the
                  8-bit categories has CODING_ISO_FLAG_SINGLE_SHIFT set;
                  otherwise it is checked as an extra Latin code.  */
2647           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2648               & CODING_ISO_FLAG_SINGLE_SHIFT)
2649             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2650           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2651               & CODING_ISO_FLAG_SINGLE_SHIFT)
2652             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2653           if (single_shifting)
2654             break;
2655           goto check_extra_latin;
2656
2657         default:
2658           if (c < 0)
2659             continue;
2660           if (c < 0x80)
2661             {
2662               single_shifting = 0;
2663               break;
2664             }
2665           if (c >= 0xA0)
2666             {
2667               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2668               found |= CATEGORY_MASK_ISO_8_1;
2669               /* Check the length of succeeding codes of the range
2670                  0xA0..0xFF.  If the byte length is even, we include
2671                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2672                  only when we are not single shifting.  */
2673               if (! single_shifting
2674                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2675                 {
                       /* This `i' shadows the function-level `i'.  It
                          counts the run of bytes >= 0xA0, starting at 1
                          for the byte already read.  */
2676                   int i = 1;
2677                   while (src < src_end)
2678                     {
2679                       ONE_MORE_BYTE (c);
2680                       if (c < 0xA0)
2681                         break;
2682                       i++;
2683                     }
2684
                       /* An odd run is rejected only when it was ended
                          by another byte, not by the end of the
                          source.  */
2685                   if (i & 1 && src < src_end)
2686                     rejected |= CATEGORY_MASK_ISO_8_2;
2687                   else
2688                     found |= CATEGORY_MASK_ISO_8_2;
2689                 }
2690               break;
2691             }
             /* Reached for a byte in 0x80..0x9F, either by falling
                through from the tests above or by `goto' from the CSI
                and SS2/SS3 cases.  Such a byte is acceptable only if
                Vlatin_extra_code_table has a non-nil entry for it.  */
2692         check_extra_latin:
2693           single_shifting = 0;
2694           if (! VECTORP (Vlatin_extra_code_table)
2695               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2696             {
2697               rejected = CATEGORY_MASK_ISO;
2698               break;
2699             }
2700           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2701               & CODING_ISO_FLAG_LATIN_EXTRA)
2702             found |= CATEGORY_MASK_ISO_8_1;
2703           else
2704             rejected |= CATEGORY_MASK_ISO_8_1;
2705           rejected |= CATEGORY_MASK_ISO_8_2;
2706         }
2707     }
       /* Every ISO category has been rejected.  */
2708   detect_info->rejected |= CATEGORY_MASK_ISO;
2709   return 0;
2710
2711  no_more_source:
2712   detect_info->rejected |= rejected;
2713   detect_info->found |= (found & ~rejected);
2714   return 1;
2715 }
2716
2717
2718 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2719    escape sequence should be kept.

        REG is the graphic register being designated.  DIM, CHARS_96
        and FINAL are passed to ISO_CHARSET_TABLE to look up the
        charset ID.  The macro uses the variable `coding' of the
        enclosing function, and CHARS_96 must be an lvalue because it
        is assigned to.  The arguments are not parenthesized in the
        expansion, so pass plain variables.

        If FINAL is outside '0'..127, the table has no charset for it,
        or the charset is not safe for CODING, the designation of REG
        is set to -2 and CHARS_96 to -1.  Otherwise the charset ID is
        recorded as the designation of REG, after replacing
        JISX0201-Roman by ASCII when CODING_ISO_FLAG_USE_ROMAN is set
        and JISX0208.1978 by JISX0208 when CODING_ISO_FLAG_USE_OLDJIS
        is set.  */
2720 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2721   do {                                                                  \
2722     int id, prev;                                                       \
2723                                                                         \
2724     if (final < '0' || final >= 128                                     \
2725         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2726         || !SAFE_CHARSET_P (coding, id))                                \
2727       {                                                                 \
2728         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2729         chars_96 = -1;                                                  \
2730         break;                                                          \
2731       }                                                                 \
2732     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2733     if (id == charset_jisx0201_roman)                                   \
2734       {                                                                 \
2735         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2736           id = charset_ascii;                                           \
2737       }                                                                 \
2738     else if (id == charset_jisx0208_1978)                               \
2739       {                                                                 \
2740         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2741           id = charset_jisx0208;                                        \
2742       }                                                                 \
2743     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2744     /* If there was an invalid designation to REG previously, and this  \
2745        designation is ASCII to REG, we should keep this designation     \
2746        sequence.  */                                                    \
2747     if (prev == -2 && id == charset_ascii)                              \
2748       chars_96 = -1;                                                    \
2749   } while (0)
2750
2751
/* If a composition is in progress, abandon it: write the components
   collected so far to CHARBUF as ordinary characters (no composition
   data is added) and reset composition_state to COMPOSING_NO.  Do
   nothing if composition_state is already COMPOSING_NO.

   The macro uses composition_state, method, components, component_idx,
   charbuf, charbuf_end and char_offset of the enclosing function, and
   jumps to the label `no_more_source' if CHARBUF lacks room for
   component_idx elements.

   For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every
   component is written; for the other methods only the components at
   even indices are written.
   NOTE(review): in the latter case char_offset advances by
   component_idx / 2 + 1 while (component_idx + 1) / 2 elements are
   written; the two differ when component_idx is even -- confirm
   whether that can happen.  */
2752 #define MAYBE_FINISH_COMPOSITION()                              \
2753   do {                                                          \
2754     int i;                                                      \
2755     if (composition_state == COMPOSING_NO)                      \
2756       break;                                                    \
2757     /* It is assured that we have enough room for producing     \
2758        characters stored in the table `components'.  */         \
2759     if (charbuf + component_idx > charbuf_end)                  \
2760       goto no_more_source;                                      \
2761     composition_state = COMPOSING_NO;                           \
2762     if (method == COMPOSITION_RELATIVE                          \
2763         || method == COMPOSITION_WITH_ALTCHARS)                 \
2764       {                                                         \
2765         for (i = 0; i < component_idx; i++)                     \
2766           *charbuf++ = components[i];                           \
2767         char_offset += component_idx;                           \
2768       }                                                         \
2769     else                                                        \
2770       {                                                         \
2771         for (i = 0; i < component_idx; i += 2)                  \
2772           *charbuf++ = components[i];                           \
2773         char_offset += (component_idx / 2) + 1;                 \
2774       }                                                         \
2775   } while (0)
2776
2777
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  The macro uses these names of the
   enclosing function: coding, src, src_end, charbuf, charbuf_end,
   composition_state, method, component_idx, component_len, and the
   labels `invalid_code' and `no_more_source'.

   ESC 0 seen while composition_state is COMPOSING_COMPONENT_RULE ends
   the alternate-component part of a composition: the number of
   components read so far is saved in component_len and the state
   becomes COMPOSING_CHAR.

   Any other case starts a new composition.  A pending one is first
   flushed with MAYBE_FINISH_COMPOSITION.  Then, unless CHARBUF has
   room for MAX_COMPOSITION_COMPONENTS elements and the rest of the
   source contains the terminating ESC 1, control leaves through
   `no_more_source' (or through `invalid_code' when ESC 1 is missing
   and this is the last block).  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (c1 == '0'                                                       \
        && composition_state == COMPOSING_COMPONENT_RULE)               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *p;                                         \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        /* Look ahead for the ESC 1 that ends this composition.  */     \
        for (p = src; p < src_end - 1; p++)                             \
          if (*p == ISO_CODE_ESC && p[1] == '1')                        \
            break;                                                      \
        /* `>=' rather than `==': when no byte is left (SRC equals      \
           SRC_END) the loop does not run and P stays at SRC_END,       \
           which must also count as "not found".  */                    \
        if (p >= src_end - 1)                                           \
          {                                                             \
            if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
              goto invalid_code;                                        \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
                  : c1 == '2' ? COMPOSITION_WITH_RULE                   \
                  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                    \
        composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
                             : COMPOSING_COMPONENT_CHAR);               \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
2820
2821
2822 /* Handle composition end sequence ESC 1.

        The macro uses method, components, component_idx,
        component_len, charbuf, char_offset, composition_state and
        coding of the enclosing function.

        The number of composed characters passed to
        ADD_COMPOSITION_DATA is component_idx - component_len when
        component_len is positive, else component_idx for
        COMPOSITION_RELATIVE, else (component_idx + 1) / 2.

        For methods other than COMPOSITION_RELATIVE the components (all
        of them when component_len is 0, otherwise the first
        component_len) are then written to CHARBUF, and the difference
        `saved_charbuf - charbuf' is stored at the position CHARBUF had
        on entry.  (Presumably ADD_COMPOSITION_DATA, defined elsewhere,
        reserves that position -- confirm.)

        Finally the characters themselves are written, incrementing
        char_offset for each: every other component starting at index 0
        for COMPOSITION_WITH_RULE, otherwise the components from index
        component_len on.  coding->annotated is set to 1 and
        composition_state is reset to COMPOSING_NO.  */
2823
2824 #define DECODE_COMPOSITION_END()                                        \
2825   do {                                                                  \
2826     int nchars = (component_len > 0 ? component_idx - component_len     \
2827                   : method == COMPOSITION_RELATIVE ? component_idx      \
2828                   : (component_idx + 1) / 2);                           \
2829     int i;                                                              \
2830     int *saved_charbuf = charbuf;                                       \
2831                                                                         \
2832     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2833     if (method != COMPOSITION_RELATIVE)                                 \
2834       {                                                                 \
2835         if (component_len == 0)                                         \
2836           for (i = 0; i < component_idx; i++)                           \
2837             *charbuf++ = components[i];                                 \
2838         else                                                            \
2839           for (i = 0; i < component_len; i++)                           \
2840             *charbuf++ = components[i];                                 \
2841         *saved_charbuf = saved_charbuf - charbuf;                       \
2842       }                                                                 \
2843     if (method == COMPOSITION_WITH_RULE)                                \
2844       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2845         *charbuf++ = components[i];                                     \
2846     else                                                                \
2847       for (i = component_len; i < component_idx; i++, char_offset++)    \
2848         *charbuf++ = components[i];                                     \
2849     coding->annotated = 1;                                              \
2850     composition_state = COMPOSING_NO;                                   \
2851   } while (0)
2852
2853
2854 /* Decode a composition rule from the byte C1 (and maybe one more byte
2855    from SRC) and replace the value of C1 with the encoded composition
2856    rule.

        C1 must be an lvalue.  32 is subtracted from it first.  A
        result below 81 is the old one-byte format: the quotient and
        remainder of the division by 9 are the two reference points,
        with the value 4 mapped to 10.  A result from 81 to 92 is the
        new two-byte format: one more byte is read with ONE_MORE_BYTE
        into the variable `c2' of the enclosing function, and the rule
        is encoded from C1 - 81 and C2 - 32.  For any other value C1 is
        set to 0.

        NOTE(review): there is no lower-bound check after the
        subtraction; the visible callers apply the macro only in the
        ISO_0x20_or_0x7F and ISO_graphic_plane_0 cases, so C1 is
        presumably at least 0x20 on entry -- confirm.  */
2857
2858 #define DECODE_COMPOSITION_RULE(c1)                                     \
2859   do {                                                                  \
2860     (c1) -= 32;                                                         \
2861     if (c1 < 81)                /* old format (before ver.21) */        \
2862       {                                                                 \
2863         int gref = (c1) / 9;                                            \
2864         int nref = (c1) % 9;                                            \
2865         if (gref == 4) gref = 10;                                       \
2866         if (nref == 4) nref = 10;                                       \
2867         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2868       }                                                                 \
2869     else if (c1 < 93)           /* new format (after ver.21) */         \
2870       {                                                                 \
2871         ONE_MORE_BYTE (c2);                                             \
2872         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2873       }                                                                 \
2874     else                                                                \
2875       c1 = 0;                                                           \
2876   } while (0)
2877
2878
2879 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2880
2881 static void
2882 decode_coding_iso_2022 (coding)
2883      struct coding_system *coding;
2884 {
2885   const unsigned char *src = coding->source + coding->consumed;
2886   const unsigned char *src_end = coding->source + coding->src_bytes;
2887   const unsigned char *src_base;
2888   int *charbuf = coding->charbuf + coding->charbuf_used;
2889   int *charbuf_end
2890     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2891   int consumed_chars = 0, consumed_chars_base;
2892   int multibytep = coding->src_multibyte;
2893   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2894   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2895   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2896   int charset_id_2, charset_id_3;
2897   struct charset *charset;
2898   int c;
2899   /* For handling composition sequence.  */
2900 #define COMPOSING_NO                    0
2901 #define COMPOSING_CHAR                  1
2902 #define COMPOSING_RULE                  2
2903 #define COMPOSING_COMPONENT_CHAR        3
2904 #define COMPOSING_COMPONENT_RULE        4
2905
2906   int composition_state = COMPOSING_NO;
2907   enum composition_method method;
2908   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2909   int component_idx;
2910   int component_len;
2911   Lisp_Object attrs, charset_list;
2912   int char_offset = coding->produced_char;
2913   int last_offset = char_offset;
2914   int last_id = charset_ascii;
2915
2916   CODING_GET_INFO (coding, attrs, charset_list);
2917   setup_iso_safe_charsets (attrs);
2918
2919   while (1)
2920     {
2921       int c1, c2;
2922
2923       src_base = src;
2924       consumed_chars_base = consumed_chars;
2925
2926       if (charbuf >= charbuf_end)
2927         break;
2928
2929       ONE_MORE_BYTE (c1);
2930       if (c1 < 0)
2931         goto invalid_code;
2932
2933       /* We produce at most one character.  */
2934       switch (iso_code_class [c1])
2935         {
2936         case ISO_0x20_or_0x7F:
2937           if (composition_state != COMPOSING_NO)
2938             {
2939               if (composition_state == COMPOSING_RULE
2940                   || composition_state == COMPOSING_COMPONENT_RULE)
2941                 {
2942                   DECODE_COMPOSITION_RULE (c1);
2943                   components[component_idx++] = c1;
2944                   composition_state--;
2945                   continue;
2946                 }
2947             }
2948           if (charset_id_0 < 0
2949               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2950             /* This is SPACE or DEL.  */
2951             charset = CHARSET_FROM_ID (charset_ascii);
2952           else
2953             charset = CHARSET_FROM_ID (charset_id_0);
2954           break;
2955
2956         case ISO_graphic_plane_0:
2957           if (composition_state != COMPOSING_NO)
2958             {
2959               if (composition_state == COMPOSING_RULE
2960                   || composition_state == COMPOSING_COMPONENT_RULE)
2961                 {
2962                   DECODE_COMPOSITION_RULE (c1);
2963                   components[component_idx++] = c1;
2964                   composition_state--;
2965                   continue;
2966                 }
2967             }
2968           if (charset_id_0 < 0)
2969             charset = CHARSET_FROM_ID (charset_ascii);
2970           else
2971             charset = CHARSET_FROM_ID (charset_id_0);
2972           break;
2973
2974         case ISO_0xA0_or_0xFF:
2975           if (charset_id_1 < 0
2976               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2977               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2978             goto invalid_code;
2979           /* This is a graphic character, we fall down ... */
2980
2981         case ISO_graphic_plane_1:
2982           if (charset_id_1 < 0)
2983             goto invalid_code;
2984           charset = CHARSET_FROM_ID (charset_id_1);
2985           break;
2986
2987         case ISO_control_0:
2988           MAYBE_FINISH_COMPOSITION ();
2989           charset = CHARSET_FROM_ID (charset_ascii);
2990           break;
2991
2992         case ISO_control_1:
2993           MAYBE_FINISH_COMPOSITION ();
2994           goto invalid_code;
2995
2996         case ISO_shift_out:
2997           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2998               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2999             goto invalid_code;
3000           CODING_ISO_INVOCATION (coding, 0) = 1;
3001           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3002           continue;
3003
3004         case ISO_shift_in:
3005           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3006             goto invalid_code;
3007           CODING_ISO_INVOCATION (coding, 0) = 0;
3008           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3009           continue;
3010
3011         case ISO_single_shift_2_7:
3012         case ISO_single_shift_2:
3013           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3014             goto invalid_code;
3015           /* SS2 is handled as an escape sequence of ESC 'N' */
3016           c1 = 'N';
3017           goto label_escape_sequence;
3018
3019         case ISO_single_shift_3:
3020           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3021             goto invalid_code;
3022           /* SS3 is handled as an escape sequence of ESC 'O' */
3023           c1 = 'O';
3024           goto label_escape_sequence;
3025
3026         case ISO_control_sequence_introducer:
3027           /* CSI is handled as an escape sequence of ESC '[' ...  */
3028           c1 = '[';
3029           goto label_escape_sequence;
3030
3031         case ISO_escape:
3032           ONE_MORE_BYTE (c1);
3033         label_escape_sequence:
3034           /* Escape sequences handled here are invocation,
3035              designation, direction specification, and character
3036              composition specification.  */
3037           switch (c1)
3038             {
3039             case '&':           /* revision of following character set */
3040               ONE_MORE_BYTE (c1);
3041               if (!(c1 >= '@' && c1 <= '~'))
3042                 goto invalid_code;
3043               ONE_MORE_BYTE (c1);
3044               if (c1 != ISO_CODE_ESC)
3045                 goto invalid_code;
3046               ONE_MORE_BYTE (c1);
3047               goto label_escape_sequence;
3048
3049             case '$':           /* designation of 2-byte character set */
3050               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3051                 goto invalid_code;
3052               {
3053                 int reg, chars96;
3054
3055                 ONE_MORE_BYTE (c1);
3056                 if (c1 >= '@' && c1 <= 'B')
3057                   {     /* designation of JISX0208.1978, GB2312.1980,
3058                            or JISX0208.1980 */
3059                     reg = 0, chars96 = 0;
3060                   }
3061                 else if (c1 >= 0x28 && c1 <= 0x2B)
3062                   { /* designation of DIMENSION2_CHARS94 character set */
3063                     reg = c1 - 0x28, chars96 = 0;
3064                     ONE_MORE_BYTE (c1);
3065                   }
3066                 else if (c1 >= 0x2C && c1 <= 0x2F)
3067                   { /* designation of DIMENSION2_CHARS96 character set */
3068                     reg = c1 - 0x2C, chars96 = 1;
3069                     ONE_MORE_BYTE (c1);
3070                   }
3071                 else
3072                   goto invalid_code;
3073                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3074                 /* We must update these variables now.  */
3075                 if (reg == 0)
3076                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3077                 else if (reg == 1)
3078                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3079                 if (chars96 < 0)
3080                   goto invalid_code;
3081               }
3082               continue;
3083
3084             case 'n':           /* invocation of locking-shift-2 */
3085               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3086                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3087                 goto invalid_code;
3088               CODING_ISO_INVOCATION (coding, 0) = 2;
3089               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3090               continue;
3091
3092             case 'o':           /* invocation of locking-shift-3 */
3093               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3094                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3095                 goto invalid_code;
3096               CODING_ISO_INVOCATION (coding, 0) = 3;
3097               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3098               continue;
3099
3100             case 'N':           /* invocation of single-shift-2 */
3101               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3102                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3103                 goto invalid_code;
3104               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3105               if (charset_id_2 < 0)
3106                 charset = CHARSET_FROM_ID (charset_ascii);
3107               else
3108                 charset = CHARSET_FROM_ID (charset_id_2);
3109               ONE_MORE_BYTE (c1);
3110               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3111                 goto invalid_code;
3112               break;
3113
3114             case 'O':           /* invocation of single-shift-3 */
3115               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3116                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3117                 goto invalid_code;
3118               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3119               if (charset_id_3 < 0)
3120                 charset = CHARSET_FROM_ID (charset_ascii);
3121               else
3122                 charset = CHARSET_FROM_ID (charset_id_3);
3123               ONE_MORE_BYTE (c1);
3124               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3125                 goto invalid_code;
3126               break;
3127
3128             case '0': case '2': case '3': case '4': /* start composition */
3129               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3130                 goto invalid_code;
3131               DECODE_COMPOSITION_START (c1);
3132               continue;
3133
3134             case '1':           /* end composition */
3135               if (composition_state == COMPOSING_NO)
3136                 goto invalid_code;
3137               DECODE_COMPOSITION_END ();
3138               continue;
3139
3140             case '[':           /* specification of direction */
3141               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3142                 goto invalid_code;
3143               /* For the moment, nested direction is not supported.
3144                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3145                  left-to-right, and nonzero means right-to-left.  */
3146               ONE_MORE_BYTE (c1);
3147               switch (c1)
3148                 {
3149                 case ']':       /* end of the current direction */
3150                   coding->mode &= ~CODING_MODE_DIRECTION;
3151
3152                 case '0':       /* end of the current direction */
3153                 case '1':       /* start of left-to-right direction */
3154                   ONE_MORE_BYTE (c1);
3155                   if (c1 == ']')
3156                     coding->mode &= ~CODING_MODE_DIRECTION;
3157                   else
3158                     goto invalid_code;
3159                   break;
3160
3161                 case '2':       /* start of right-to-left direction */
3162                   ONE_MORE_BYTE (c1);
3163                   if (c1 == ']')
3164                     coding->mode |= CODING_MODE_DIRECTION;
3165                   else
3166                     goto invalid_code;
3167                   break;
3168
3169                 default:
3170                   goto invalid_code;
3171                 }
3172               continue;
3173
3174             case '%':
3175               ONE_MORE_BYTE (c1);
3176               if (c1 == '/')
3177                 {
3178                   /* CTEXT extended segment:
3179                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3180                      We keep these bytes as is for the moment.
3181                      They may be decoded by post-read-conversion.  */
3182                   int dim, M, L;
3183                   int size;
3184
3185                   ONE_MORE_BYTE (dim);
3186                   ONE_MORE_BYTE (M);
3187                   ONE_MORE_BYTE (L);
3188                   size = ((M - 128) * 128) + (L - 128);
3189                   if (charbuf + 8 + size > charbuf_end)
3190                     goto break_loop;
3191                   *charbuf++ = ISO_CODE_ESC;
3192                   *charbuf++ = '%';
3193                   *charbuf++ = '/';
3194                   *charbuf++ = dim;
3195                   *charbuf++ = BYTE8_TO_CHAR (M);
3196                   *charbuf++ = BYTE8_TO_CHAR (L);
3197                   while (size-- > 0)
3198                     {
3199                       ONE_MORE_BYTE (c1);
3200                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3201                     }
3202                 }
3203               else if (c1 == 'G')
3204                 {
3205                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3206                      ESC % G --UTF-8-BYTES-- ESC % @
3207                      We keep these bytes as is for the moment.
3208                      They may be decoded by post-read-conversion.  */
3209                   int *p = charbuf;
3210
3211                   if (p + 6 > charbuf_end)
3212                     goto break_loop;
3213                   *p++ = ISO_CODE_ESC;
3214                   *p++ = '%';
3215                   *p++ = 'G';
3216                   while (p < charbuf_end)
3217                     {
3218                       ONE_MORE_BYTE (c1);
3219                       if (c1 == ISO_CODE_ESC
3220                           && src + 1 < src_end
3221                           && src[0] == '%'
3222                           && src[1] == '@')
3223                         {
3224                           src += 2;
3225                           break;
3226                         }
3227                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3228                     }
3229                   if (p + 3 > charbuf_end)
3230                     goto break_loop;
3231                   *p++ = ISO_CODE_ESC;
3232                   *p++ = '%';
3233                   *p++ = '@';
3234                   charbuf = p;
3235                 }
3236               else
3237                 goto invalid_code;
3238               continue;
3239               break;
3240
3241             default:
3242               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3243                 goto invalid_code;
3244               {
3245                 int reg, chars96;
3246
3247                 if (c1 >= 0x28 && c1 <= 0x2B)
3248                   { /* designation of DIMENSION1_CHARS94 character set */
3249                     reg = c1 - 0x28, chars96 = 0;
3250                     ONE_MORE_BYTE (c1);
3251                   }
3252                 else if (c1 >= 0x2C && c1 <= 0x2F)
3253                   { /* designation of DIMENSION1_CHARS96 character set */
3254                     reg = c1 - 0x2C, chars96 = 1;
3255                     ONE_MORE_BYTE (c1);
3256                   }
3257                 else
3258                   goto invalid_code;
3259                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3260                 /* We must update these variables now.  */
3261                 if (reg == 0)
3262                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3263                 else if (reg == 1)
3264                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3265                 if (chars96 < 0)
3266                   goto invalid_code;
3267               }
3268               continue;
3269             }
3270         }
3271
3272       if (charset->id != charset_ascii
3273           && last_id != charset->id)
3274         {
3275           if (last_id != charset_ascii)
3276             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3277           last_id = charset->id;
3278           last_offset = char_offset;
3279         }
3280
3281       /* Now we know CHARSET and 1st position code C1 of a character.
3282          Produce a decoded character while getting 2nd position code
3283          C2 if necessary.  */
3284       c1 &= 0x7F;
3285       if (CHARSET_DIMENSION (charset) > 1)
3286         {
3287           ONE_MORE_BYTE (c2);
3288           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3289             /* C2 is not in a valid range.  */
3290             goto invalid_code;
3291           c1 = (c1 << 8) | (c2 & 0x7F);
3292           if (CHARSET_DIMENSION (charset) > 2)
3293             {
3294               ONE_MORE_BYTE (c2);
3295               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3296                 /* C2 is not in a valid range.  */
3297                 goto invalid_code;
3298               c1 = (c1 << 8) | (c2 & 0x7F);
3299             }
3300         }
3301
3302       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3303       if (c < 0)
3304         {
3305           MAYBE_FINISH_COMPOSITION ();
3306           for (; src_base < src; src_base++, char_offset++)
3307             {
3308               if (ASCII_BYTE_P (*src_base))
3309                 *charbuf++ = *src_base;
3310               else
3311                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3312             }
3313         }
3314       else if (composition_state == COMPOSING_NO)
3315         {
3316           *charbuf++ = c;
3317           char_offset++;
3318         }
3319       else
3320         {
3321           components[component_idx++] = c;
3322           if (method == COMPOSITION_WITH_RULE
3323               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3324                   && composition_state == COMPOSING_COMPONENT_CHAR))
3325             composition_state++;
3326         }
3327       continue;
3328
3329     invalid_code:
3330       MAYBE_FINISH_COMPOSITION ();
3331       src = src_base;
3332       consumed_chars = consumed_chars_base;
3333       ONE_MORE_BYTE (c);
3334       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3335       char_offset++;
3336       coding->errors++;
3337       continue;
3338
3339     break_loop:
3340       break;
3341     }
3342
3343  no_more_source:
3344   if (last_id != charset_ascii)
3345     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3346   coding->consumed_char += consumed_chars_base;
3347   coding->consumed = src_base - coding->source;
3348   coding->charbuf_used = charbuf - coding->charbuf;
3349 }
3350
3351
3352 /* ISO2022 encoding stuff.  */
3353
3354 /*
3355    It is not enough to say just "ISO2022" on encoding, we have to
3356    specify more details.  In Emacs, each coding system of ISO2022
3357    variant has the following specifications:
3358         1. Initial designation to G0 thru G3.
3359         2. Allows short-form designation?
3360         3. ASCII should be designated to G0 before control characters?
3361         4. ASCII should be designated to G0 at end of line?
3362         5. 7-bit environment or 8-bit environment?
3363         6. Use locking-shift?
3364         7. Use Single-shift?
3365    And the following two are only for Japanese:
3366         8. Use ASCII in place of JIS0201-1976-Roman?
3367         9. Use JISX0208-1983 in place of JISX0208-1978?
3368    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3369    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3370    details.
3371 */
3372
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  The bytes
   produced are, in order:
     - `ESC & <'@' + revision>', only when CODING has
       CODING_ISO_FLAG_REVISION set and the revision of CHARSET is
       non-negative;
     - ESC;
     - '$', only when CHARSET is not one-dimensional;
     - the intermediate character selecting REG (one of "()*+" for a
       94-charset, one of ",-./" for a 96-charset);
     - the <final-char> of CHARSET.
   The intermediate character is left out (short form) for a
   multi-dimensional 94-charset designated to register 0 whose
   <final-char> is '@', 'A', or 'B', unless CODING has
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char designation_final = CHARSET_ISO_FINAL (charset);      \
    const char *designation_inter                                       \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int designation_rev = -1;                                           \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      designation_rev = CHARSET_ISO_REVISION (charset);                 \
    if (designation_rev >= 0)                                           \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + designation_rev);                          \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
                                                                        \
    /* Only the short form omits the intermediate character.  */        \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || CHARSET_ISO_CHARS_96 (charset)                               \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || designation_final < '@'                                      \
        || designation_final > 'B')                                     \
      EMIT_ONE_ASCII_BYTE (designation_inter[reg]);                     \
                                                                        \
    EMIT_ONE_ASCII_BYTE (designation_final);                            \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3420
3421
3422 /* The following two macros produce codes (control character or escape
3423    sequence) for ISO2022 single-shift functions (single-shift-2 and
3424    single-shift-3).  */
3425
/* Produce the code for the ISO2022 single-shift-2 function: the single
   byte ISO_CODE_SS2, or `ESC N' when CODING is restricted to seven
   bits.  CODING is then marked as being in the single-shifting
   state.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3434
3435
/* Produce the code for the ISO2022 single-shift-3 function: the single
   byte ISO_CODE_SS3, or `ESC O' when CODING is restricted to seven
   bits.  CODING is then marked as being in the single-shifting
   state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3444
3445
3446 /* The following four macros produce codes (control character or
3447    escape sequence) for ISO2022 locking-shift functions (shift-in,
3448    shift-out, locking-shift-2, and locking-shift-3).  */
3449
/* Shift-in: record in CODING that graphic plane 0 invokes graphic
   register 0, and emit the ISO_CODE_SI control character.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
3455
3456
/* Shift-out: record in CODING that graphic plane 0 invokes graphic
   register 1, and emit the ISO_CODE_SO control character.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
3462
3463
/* Locking-shift-2: record in CODING that graphic plane 0 invokes
   graphic register 2, and emit the escape sequence `ESC n'.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
3469
3470
/* Locking-shift-3: emit the escape sequence `ESC o' and record in
   CODING that graphic plane 0 invokes graphic register 3.  The final
   byte must be 'o': `ESC n' is locking-shift-2, and the decoder of
   this file treats ESC 'n' and ESC 'o' as the invocations of
   registers 2 and 3 respectively.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3476
3477
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has CODING_ISO_FLAG_USE_ROMAN
   set and CHARSET is ASCII, CHARSET is replaced by JISX0201-Roman.
   C1 may be any integer expression; it is parenthesized at every use
   so that the masking applies to the whole expression.  The macro
   refers to the variables `coding', `dst' and `produced_chars' of the
   code it is expanded in.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code has just been produced; this one         \
           character is taken from the shifted register.  */            \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3520
3521
/* Emit the two position-codes C1 and C2 of a DIMENSION2 character of
   CHARSET, first producing whatever designation and invocation codes
   are still missing.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, CHARSET is
   replaced by JISX0208-1978.  The macro refers to the variables
   `coding', `dst' and `produced_chars' of the code it is expanded
   in.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int this_id = CHARSET_ID (charset);                                 \
                                                                        \
    if (this_id == charset_jisx0208                                     \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        this_id = charset_jisx0208_1978;                                \
        charset = CHARSET_FROM_ID (this_id);                            \
      }                                                                 \
                                                                        \
    /* A single-shift code was just produced: this one character is     \
       taken from the shifted register.  */                             \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
          }                                                             \
        else                                                            \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
          }                                                             \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 0.  */                       \
    if (this_id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 1.  */                       \
    if (this_id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to a graphic register first if needed), then     \
       go around the loop again to produce the character itself.  */    \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3564
3565
/* Produce codes for character C of CHARSET.  C is converted to its
   code in CHARSET by ENCODE_CHAR.  For a one-dimensional CHARSET that
   code is the single position-code; otherwise the code shifted right
   by 8 bits and its low 8 bits are passed on as the two
   position-codes.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int iso_code = ENCODE_CHAR ((charset),(c));                            \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), iso_code >> 8,         \
                                         iso_code & 0xFF);                 \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);             \
      }                                                                    \
  } while (0)
3575
3576
3577 /* Produce designation and invocation codes at a place pointed by DST
3578    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3579    Return new DST.  */
3580
3581 unsigned char *
3582 encode_invocation_designation (charset, coding, dst, p_nchars)
3583      struct charset *charset;
3584      struct coding_system *coding;
3585      unsigned char *dst;
3586      int *p_nchars;
3587 {
3588   int multibytep = coding->dst_multibyte;
3589   int produced_chars = *p_nchars;
3590   int reg;                      /* graphic register number */
3591   int id = CHARSET_ID (charset);
3592
3593   /* At first, check designations.  */
3594   for (reg = 0; reg < 4; reg++)
3595     if (id == CODING_ISO_DESIGNATION (coding, reg))
3596       break;
3597
3598   if (reg >= 4)
3599     {
3600       /* CHARSET is not yet designated to any graphic registers.  */
3601       /* At first check the requested designation.  */
3602       reg = CODING_ISO_REQUEST (coding, id);
3603       if (reg < 0)
3604         /* Since CHARSET requests no special designation, designate it
3605            to graphic register 0.  */
3606         reg = 0;
3607
3608       ENCODE_DESIGNATION (charset, reg, coding);
3609     }
3610
3611   if (CODING_ISO_INVOCATION (coding, 0) != reg
3612       && CODING_ISO_INVOCATION (coding, 1) != reg)
3613     {
3614       /* Since the graphic register REG is not invoked to any graphic
3615          planes, invoke it to graphic plane 0.  */
3616       switch (reg)
3617         {
3618         case 0:                 /* graphic register 0 */
3619           ENCODE_SHIFT_IN;
3620           break;
3621
3622         case 1:                 /* graphic register 1 */
3623           ENCODE_SHIFT_OUT;
3624           break;
3625
3626         case 2:                 /* graphic register 2 */
3627           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3628             ENCODE_SINGLE_SHIFT_2;
3629           else
3630             ENCODE_LOCKING_SHIFT_2;
3631           break;
3632
3633         case 3:                 /* graphic register 3 */
3634           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3635             ENCODE_SINGLE_SHIFT_3;
3636           else
3637             ENCODE_LOCKING_SHIFT_3;
3638           break;
3639         }
3640     }
3641
3642   *p_nchars = produced_chars;
3643   return dst;
3644 }
3645
3646 /* The following three macros produce codes for indicating direction
3647    of text.  */
/* Produce the control sequence introducer: `ESC [' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&' because CODING_ISO_FLAGS
   holds several flag bits at once; comparing the whole word for
   equality would miss the seven-bit case whenever any other flag is
   set too.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3655
3656
/* Produce the codes that start right-to-left text: the control
   sequence introducer followed by `2 ]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing `(dst)' after it would
   leave a stray `(dst)' behind its `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3662
3663
/* Produce the codes that return to left-to-right text: the control
   sequence introducer followed by `0 ]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; writing `(dst)' after it would
   leave a stray `(dst)' behind its `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3669
3670
/* Produce the codes that bring the graphic planes and registers of
   CODING back to their initial state: a shift-in when graphic plane 0
   does not currently invoke register 0, then a designation sequence
   for every register whose initial charset is specified
   (non-negative) and differs from the current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *initial_charset;                                    \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      {                                                                 \
        ENCODE_SHIFT_IN;                                                \
      }                                                                 \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        /* No initial charset for this register.  */                    \
        if (CODING_ISO_INITIAL (coding, reg) < 0)                       \
          continue;                                                     \
        /* Already designated as in the initial state.  */              \
        if (CODING_ISO_DESIGNATION (coding, reg)                        \
            == CODING_ISO_INITIAL (coding, reg))                        \
          continue;                                                     \
        initial_charset                                                 \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));         \
        ENCODE_DESIGNATION (initial_charset, reg, coding);              \
      }                                                                 \
  } while (0)
3689
3690
3691 /* Produce designation sequences of charsets in the line started from
3692    SRC to a place pointed by DST, and return updated DST.
3693
3694    If the current block ends before any end-of-line, we may fail to
3695    find all the necessary designations.  */
3696
3697 static unsigned char *
3698 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3699      struct coding_system *coding;
3700      int *charbuf, *charbuf_end;
3701      unsigned char *dst;
3702 {
3703   struct charset *charset;
3704   /* Table of charsets to be designated to each graphic register.  */
3705   int r[4];
3706   int c, found = 0, reg;
3707   int produced_chars = 0;
3708   int multibytep = coding->dst_multibyte;
3709   Lisp_Object attrs;
3710   Lisp_Object charset_list;
3711
3712   attrs = CODING_ID_ATTRS (coding->id);
3713   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3714   if (EQ (charset_list, Qiso_2022))
3715     charset_list = Viso_2022_charset_list;
3716
3717   for (reg = 0; reg < 4; reg++)
3718     r[reg] = -1;
3719
3720   while (found < 4)
3721     {
3722       int id;
3723
3724       c = *charbuf++;
3725       if (c == '\n')
3726         break;
3727       charset = char_charset (c, charset_list, NULL);
3728       id = CHARSET_ID (charset);
3729       reg = CODING_ISO_REQUEST (coding, id);
3730       if (reg >= 0 && r[reg] < 0)
3731         {
3732           found++;
3733           r[reg] = id;
3734         }
3735     }
3736
3737   if (found)
3738     {
3739       for (reg = 0; reg < 4; reg++)
3740         if (r[reg] >= 0
3741             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3742           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3743     }
3744
3745   return dst;
3746 }
3747
3748 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3749
3750 static int
3751 encode_coding_iso_2022 (coding)
3752      struct coding_system *coding;
3753 {
3754   int multibytep = coding->dst_multibyte;
3755   int *charbuf = coding->charbuf;
3756   int *charbuf_end = charbuf + coding->charbuf_used;
3757   unsigned char *dst = coding->destination + coding->produced;
3758   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3759   int safe_room = 16;
3760   int bol_designation
3761     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3762        && CODING_ISO_BOL (coding));
3763   int produced_chars = 0;
3764   Lisp_Object attrs, eol_type, charset_list;
3765   int ascii_compatible;
3766   int c;
3767   int preferred_charset_id = -1;
3768
3769   CODING_GET_INFO (coding, attrs, charset_list);
3770   eol_type = CODING_ID_EOL_TYPE (coding->id);
3771   if (VECTORP (eol_type))
3772     eol_type = Qunix;
3773
3774   setup_iso_safe_charsets (attrs);
3775   /* Charset list may have been changed.  */
3776   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3777   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3778
3779   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3780
3781   while (charbuf < charbuf_end)
3782     {
3783       ASSURE_DESTINATION (safe_room);
3784
3785       if (bol_designation)
3786         {
3787           unsigned char *dst_prev = dst;
3788
3789           /* We have to produce designation sequences if any now.  */
3790           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3791           bol_designation = 0;
3792           /* We are sure that designation sequences are all ASCII bytes.  */
3793           produced_chars += dst - dst_prev;
3794         }
3795
3796       c = *charbuf++;
3797
3798       if (c < 0)
3799         {
3800           /* Handle an annotation.  */
3801           switch (*charbuf)
3802             {
3803             case CODING_ANNOTATE_COMPOSITION_MASK:
3804               /* Not yet implemented.  */
3805               break;
3806             case CODING_ANNOTATE_CHARSET_MASK:
3807               preferred_charset_id = charbuf[2];
3808               if (preferred_charset_id >= 0
3809                   && NILP (Fmemq (make_number (preferred_charset_id),
3810                                   charset_list)))
3811                 preferred_charset_id = -1;
3812               break;
3813             default:
3814               abort ();
3815             }
3816           charbuf += -c - 1;
3817           continue;
3818         }
3819
3820       /* Now encode the character C.  */
3821       if (c < 0x20 || c == 0x7F)
3822         {
3823           if (c == '\n'
3824               || (c == '\r' && EQ (eol_type, Qmac)))
3825             {
3826               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3827                 ENCODE_RESET_PLANE_AND_REGISTER ();
3828               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3829                 {
3830                   int i;
3831
3832                   for (i = 0; i < 4; i++)
3833                     CODING_ISO_DESIGNATION (coding, i)
3834                       = CODING_ISO_INITIAL (coding, i);
3835                 }
3836               bol_designation
3837                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3838             }
3839           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3840             ENCODE_RESET_PLANE_AND_REGISTER ();
3841           EMIT_ONE_ASCII_BYTE (c);
3842         }
3843       else if (ASCII_CHAR_P (c))
3844         {
3845           if (ascii_compatible)
3846             EMIT_ONE_ASCII_BYTE (c);
3847           else
3848             {
3849               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3850               ENCODE_ISO_CHARACTER (charset, c);
3851             }
3852         }
3853       else if (CHAR_BYTE8_P (c))
3854         {
3855           c = CHAR_TO_BYTE8 (c);
3856           EMIT_ONE_BYTE (c);
3857         }
3858       else
3859         {
3860           struct charset *charset;
3861
3862           if (preferred_charset_id >= 0)
3863             {
3864               charset = CHARSET_FROM_ID (preferred_charset_id);
3865               if (! CHAR_CHARSET_P (c, charset))
3866                 charset = char_charset (c, charset_list, NULL);
3867             }
3868           else
3869             charset = char_charset (c, charset_list, NULL);
3870           if (!charset)
3871             {
3872               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3873                 {
3874                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3875                   charset = CHARSET_FROM_ID (charset_ascii);
3876                 }
3877               else
3878                 {
3879                   c = coding->default_char;
3880                   charset = char_charset (c, charset_list, NULL);
3881                 }
3882             }
3883           ENCODE_ISO_CHARACTER (charset, c);
3884         }
3885     }
3886
3887   if (coding->mode & CODING_MODE_LAST_BLOCK
3888       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3889     {
3890       ASSURE_DESTINATION (safe_room);
3891       ENCODE_RESET_PLANE_AND_REGISTER ();
3892     }
3893   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3894   CODING_ISO_BOL (coding) = bol_designation;
3895   coding->produced_char += produced_chars;
3896   coding->produced = dst - coding->destination;
3897   return 0;
3898 }
3899
3900 \f
/*** 8. SJIS and BIG5 handlers ***/
3902
3903 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3904    quite widely.  So, for the moment, Emacs supports them in the bare
3905    C code.  But, in the future, they may be supported only by CCL.  */
3906
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with the two position-codes divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA1 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
3923
3924 /* BIG5 is a coding system encoding two character sets: ASCII and
3925    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3926    character set and is encoded in two-byte.
3927
3928    --- CODE RANGE of BIG5 ---
3929    (character set)      (range)
3930    ASCII                0x00 .. 0x7F
3931    Big5 (1st byte)      0xA1 .. 0xFE
3932         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3933    --------------------------
3934
3935   */
3936
3937 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3938    Check if a text is encoded in SJIS.  If it is, return
3939    CATEGORY_MASK_SJIS, else return 0.  */
3940
3941 static int
3942 detect_coding_sjis (coding, detect_info)
3943      struct coding_system *coding;
3944      struct coding_detection_info *detect_info;
3945 {
3946   const unsigned char *src = coding->source, *src_base;
3947   const unsigned char *src_end = coding->source + coding->src_bytes;
3948   int multibytep = coding->src_multibyte;
3949   int consumed_chars = 0;
3950   int found = 0;
3951   int c;
3952
3953   detect_info->checked |= CATEGORY_MASK_SJIS;
3954   /* A coding system of this category is always ASCII compatible.  */
3955   src += coding->head_ascii;
3956
3957   while (1)
3958     {
3959       src_base = src;
3960       ONE_MORE_BYTE (c);
3961       if (c < 0x80)
3962         continue;
3963       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3964         {
3965           ONE_MORE_BYTE (c);
3966           if (c < 0x40 || c == 0x7F || c > 0xFC)
3967             break;
3968           found = CATEGORY_MASK_SJIS;
3969         }
3970       else if (c >= 0xA0 && c < 0xE0)
3971         found = CATEGORY_MASK_SJIS;
3972       else
3973         break;
3974     }
3975   detect_info->rejected |= CATEGORY_MASK_SJIS;
3976   return 0;
3977
3978  no_more_source:
3979   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3980     {
3981       detect_info->rejected |= CATEGORY_MASK_SJIS;
3982       return 0;
3983     }
3984   detect_info->found |= found;
3985   return 1;
3986 }
3987
3988 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3989    Check if a text is encoded in BIG5.  If it is, return
3990    CATEGORY_MASK_BIG5, else return 0.  */
3991
3992 static int
3993 detect_coding_big5 (coding, detect_info)
3994      struct coding_system *coding;
3995      struct coding_detection_info *detect_info;
3996 {
3997   const unsigned char *src = coding->source, *src_base;
3998   const unsigned char *src_end = coding->source + coding->src_bytes;
3999   int multibytep = coding->src_multibyte;
4000   int consumed_chars = 0;
4001   int found = 0;
4002   int c;
4003
4004   detect_info->checked |= CATEGORY_MASK_BIG5;
4005   /* A coding system of this category is always ASCII compatible.  */
4006   src += coding->head_ascii;
4007
4008   while (1)
4009     {
4010       src_base = src;
4011       ONE_MORE_BYTE (c);
4012       if (c < 0x80)
4013         continue;
4014       if (c >= 0xA1)
4015         {
4016           ONE_MORE_BYTE (c);
4017           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4018             return 0;
4019           found = CATEGORY_MASK_BIG5;
4020         }
4021       else
4022         break;
4023     }
4024   detect_info->rejected |= CATEGORY_MASK_BIG5;
4025   return 0;
4026
4027  no_more_source:
4028   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4029     {
4030       detect_info->rejected |= CATEGORY_MASK_BIG5;
4031       return 0;
4032     }
4033   detect_info->found |= found;
4034   return 1;
4035 }
4036
4037 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4038    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4039
4040 static void
4041 decode_coding_sjis (coding)
4042      struct coding_system *coding;
4043 {
4044   const unsigned char *src = coding->source + coding->consumed;
4045   const unsigned char *src_end = coding->source + coding->src_bytes;
4046   const unsigned char *src_base;
4047   int *charbuf = coding->charbuf + coding->charbuf_used;
4048   int *charbuf_end
4049     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4050   int consumed_chars = 0, consumed_chars_base;
4051   int multibytep = coding->src_multibyte;
4052   struct charset *charset_roman, *charset_kanji, *charset_kana;
4053   struct charset *charset_kanji2;
4054   Lisp_Object attrs, charset_list, val;
4055   int char_offset = coding->produced_char;
4056   int last_offset = char_offset;
4057   int last_id = charset_ascii;
4058
4059   CODING_GET_INFO (coding, attrs, charset_list);
4060
4061   val = charset_list;
4062   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4063   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4064   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4065   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4066
4067   while (1)
4068     {
4069       int c, c1;
4070       struct charset *charset;
4071
4072       src_base = src;
4073       consumed_chars_base = consumed_chars;
4074
4075       if (charbuf >= charbuf_end)
4076         break;
4077
4078       ONE_MORE_BYTE (c);
4079       if (c < 0)
4080         goto invalid_code;
4081       if (c < 0x80)
4082         charset = charset_roman;
4083       else if (c == 0x80 || c == 0xA0)
4084         goto invalid_code;
4085       else if (c >= 0xA1 && c <= 0xDF)
4086         {
4087           /* SJIS -> JISX0201-Kana */
4088           c &= 0x7F;
4089           charset = charset_kana;
4090         }
4091       else if (c <= 0xEF)
4092         {
4093           /* SJIS -> JISX0208 */
4094           ONE_MORE_BYTE (c1);
4095           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4096             goto invalid_code;
4097           c = (c << 8) | c1;
4098           SJIS_TO_JIS (c);
4099           charset = charset_kanji;
4100         }
4101       else if (c <= 0xFC && charset_kanji2)
4102         {
4103           /* SJIS -> JISX0213-2 */
4104           ONE_MORE_BYTE (c1);
4105           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4106             goto invalid_code;
4107           c = (c << 8) | c1;
4108           SJIS_TO_JIS2 (c);
4109           charset = charset_kanji2;
4110         }
4111       else
4112         goto invalid_code;
4113       if (charset->id != charset_ascii
4114           && last_id != charset->id)
4115         {
4116           if (last_id != charset_ascii)
4117             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4118           last_id = charset->id;
4119           last_offset = char_offset;
4120         }
4121       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4122       *charbuf++ = c;
4123       char_offset++;
4124       continue;
4125
4126     invalid_code:
4127       src = src_base;
4128       consumed_chars = consumed_chars_base;
4129       ONE_MORE_BYTE (c);
4130       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4131       char_offset++;
4132       coding->errors++;
4133     }
4134
4135  no_more_source:
4136   if (last_id != charset_ascii)
4137     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4138   coding->consumed_char += consumed_chars_base;
4139   coding->consumed = src_base - coding->source;
4140   coding->charbuf_used = charbuf - coding->charbuf;
4141 }
4142
4143 static void
4144 decode_coding_big5 (coding)
4145      struct coding_system *coding;
4146 {
4147   const unsigned char *src = coding->source + coding->consumed;
4148   const unsigned char *src_end = coding->source + coding->src_bytes;
4149   const unsigned char *src_base;
4150   int *charbuf = coding->charbuf + coding->charbuf_used;
4151   int *charbuf_end
4152     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4153   int consumed_chars = 0, consumed_chars_base;
4154   int multibytep = coding->src_multibyte;
4155   struct charset *charset_roman, *charset_big5;
4156   Lisp_Object attrs, charset_list, val;
4157   int char_offset = coding->produced_char;
4158   int last_offset = char_offset;
4159   int last_id = charset_ascii;
4160
4161   CODING_GET_INFO (coding, attrs, charset_list);
4162   val = charset_list;
4163   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4164   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4165
4166   while (1)
4167     {
4168       int c, c1;
4169       struct charset *charset;
4170
4171       src_base = src;
4172       consumed_chars_base = consumed_chars;
4173
4174       if (charbuf >= charbuf_end)
4175         break;
4176
4177       ONE_MORE_BYTE (c);
4178
4179       if (c < 0)
4180         goto invalid_code;
4181       if (c < 0x80)
4182         charset = charset_roman;
4183       else
4184         {
4185           /* BIG5 -> Big5 */
4186           if (c < 0xA1 || c > 0xFE)
4187             goto invalid_code;
4188           ONE_MORE_BYTE (c1);
4189           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4190             goto invalid_code;
4191           c = c << 8 | c1;
4192           charset = charset_big5;
4193         }
4194       if (charset->id != charset_ascii
4195           && last_id != charset->id)
4196         {
4197           if (last_id != charset_ascii)
4198             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4199           last_id = charset->id;
4200           last_offset = char_offset;
4201         }
4202       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4203       *charbuf++ = c;
4204       char_offset++;
4205       continue;
4206
4207     invalid_code:
4208       src = src_base;
4209       consumed_chars = consumed_chars_base;
4210       ONE_MORE_BYTE (c);
4211       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4212       char_offset++;
4213       coding->errors++;
4214     }
4215
4216  no_more_source:
4217   if (last_id != charset_ascii)
4218     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4219   coding->consumed_char += consumed_chars_base;
4220   coding->consumed = src_base - coding->source;
4221   coding->charbuf_used = charbuf - coding->charbuf;
4222 }
4223
4224 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4225    This function can encode charsets `ascii', `katakana-jisx0201',
4226    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4227    are sure that all these charsets are registered as official charset
4228    (i.e. do not have extended leading-codes).  Characters of other
4229    charsets are produced without any encoding.  If SJIS_P is 1, encode
4230    SJIS text, else encode BIG5 text.  */
4231
4232 static int
4233 encode_coding_sjis (coding)
4234      struct coding_system *coding;
4235 {
4236   int multibytep = coding->dst_multibyte;
4237   int *charbuf = coding->charbuf;
4238   int *charbuf_end = charbuf + coding->charbuf_used;
4239   unsigned char *dst = coding->destination + coding->produced;
4240   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4241   int safe_room = 4;
4242   int produced_chars = 0;
4243   Lisp_Object attrs, charset_list, val;
4244   int ascii_compatible;
4245   struct charset *charset_roman, *charset_kanji, *charset_kana;
4246   struct charset *charset_kanji2;
4247   int c;
4248
4249   CODING_GET_INFO (coding, attrs, charset_list);
4250   val = charset_list;
4251   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4252   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4253   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4254   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4255
4256   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4257
4258   while (charbuf < charbuf_end)
4259     {
4260       ASSURE_DESTINATION (safe_room);
4261       c = *charbuf++;
4262       /* Now encode the character C.  */
4263       if (ASCII_CHAR_P (c) && ascii_compatible)
4264         EMIT_ONE_ASCII_BYTE (c);
4265       else if (CHAR_BYTE8_P (c))
4266         {
4267           c = CHAR_TO_BYTE8 (c);
4268           EMIT_ONE_BYTE (c);
4269         }
4270       else
4271         {
4272           unsigned code;
4273           struct charset *charset = char_charset (c, charset_list, &code);
4274
4275           if (!charset)
4276             {
4277               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4278                 {
4279                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4280                   charset = CHARSET_FROM_ID (charset_ascii);
4281                 }
4282               else
4283                 {
4284                   c = coding->default_char;
4285                   charset = char_charset (c, charset_list, &code);
4286                 }
4287             }
4288           if (code == CHARSET_INVALID_CODE (charset))
4289             abort ();
4290           if (charset == charset_kanji)
4291             {
4292               int c1, c2;
4293               JIS_TO_SJIS (code);
4294               c1 = code >> 8, c2 = code & 0xFF;
4295               EMIT_TWO_BYTES (c1, c2);
4296             }
4297           else if (charset == charset_kana)
4298             EMIT_ONE_BYTE (code | 0x80);
4299           else if (charset_kanji2 && charset == charset_kanji2)
4300             {
4301               int c1, c2;
4302
4303               c1 = code >> 8;
4304               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4305                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4306                 {
4307                   JIS_TO_SJIS2 (code);
4308                   c1 = code >> 8, c2 = code & 0xFF;
4309                   EMIT_TWO_BYTES (c1, c2);
4310                 }
4311               else
4312                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4313             }
4314           else
4315             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4316         }
4317     }
4318   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4319   coding->produced_char += produced_chars;
4320   coding->produced = dst - coding->destination;
4321   return 0;
4322 }
4323
4324 static int
4325 encode_coding_big5 (coding)
4326      struct coding_system *coding;
4327 {
4328   int multibytep = coding->dst_multibyte;
4329   int *charbuf = coding->charbuf;
4330   int *charbuf_end = charbuf + coding->charbuf_used;
4331   unsigned char *dst = coding->destination + coding->produced;
4332   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4333   int safe_room = 4;
4334   int produced_chars = 0;
4335   Lisp_Object attrs, charset_list, val;
4336   int ascii_compatible;
4337   struct charset *charset_roman, *charset_big5;
4338   int c;
4339
4340   CODING_GET_INFO (coding, attrs, charset_list);
4341   val = charset_list;
4342   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4343   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4344   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4345
4346   while (charbuf < charbuf_end)
4347     {
4348       ASSURE_DESTINATION (safe_room);
4349       c = *charbuf++;
4350       /* Now encode the character C.  */
4351       if (ASCII_CHAR_P (c) && ascii_compatible)
4352         EMIT_ONE_ASCII_BYTE (c);
4353       else if (CHAR_BYTE8_P (c))
4354         {
4355           c = CHAR_TO_BYTE8 (c);
4356           EMIT_ONE_BYTE (c);
4357         }
4358       else
4359         {
4360           unsigned code;
4361           struct charset *charset = char_charset (c, charset_list, &code);
4362
4363           if (! charset)
4364             {
4365               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4366                 {
4367                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4368                   charset = CHARSET_FROM_ID (charset_ascii);
4369                 }
4370               else
4371                 {
4372                   c = coding->default_char;
4373                   charset = char_charset (c, charset_list, &code);
4374                 }
4375             }
4376           if (code == CHARSET_INVALID_CODE (charset))
4377             abort ();
4378           if (charset == charset_big5)
4379             {
4380               int c1, c2;
4381
4382               c1 = code >> 8, c2 = code & 0xFF;
4383               EMIT_TWO_BYTES (c1, c2);
4384             }
4385           else
4386             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4387         }
4388     }
4389   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4390   coding->produced_char += produced_chars;
4391   coding->produced = dst - coding->destination;
4392   return 0;
4393 }
4394
4395 \f
/*** 9. CCL handlers ***/
4397
4398 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4399    Check if a text is encoded in a coding system of which
4400    encoder/decoder are written in CCL program.  If it is, return
4401    CATEGORY_MASK_CCL, else return 0.  */
4402
4403 static int
4404 detect_coding_ccl (coding, detect_info)
4405      struct coding_system *coding;
4406      struct coding_detection_info *detect_info;
4407 {
4408   const unsigned char *src = coding->source, *src_base;
4409   const unsigned char *src_end = coding->source + coding->src_bytes;
4410   int multibytep = coding->src_multibyte;
4411   int consumed_chars = 0;
4412   int found = 0;
4413   unsigned char *valids;
4414   int head_ascii = coding->head_ascii;
4415   Lisp_Object attrs;
4416
4417   detect_info->checked |= CATEGORY_MASK_CCL;
4418
4419   coding = &coding_categories[coding_category_ccl];
4420   valids = CODING_CCL_VALIDS (coding);
4421   attrs = CODING_ID_ATTRS (coding->id);
4422   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4423     src += head_ascii;
4424
4425   while (1)
4426     {
4427       int c;
4428
4429       src_base = src;
4430       ONE_MORE_BYTE (c);
4431       if (c < 0 || ! valids[c])
4432         break;
4433       if ((valids[c] > 1))
4434         found = CATEGORY_MASK_CCL;
4435     }
4436   detect_info->rejected |= CATEGORY_MASK_CCL;
4437   return 0;
4438
4439  no_more_source:
4440   detect_info->found |= found;
4441   return 1;
4442 }
4443
4444 static void
4445 decode_coding_ccl (coding)
4446      struct coding_system *coding;
4447 {
4448   const unsigned char *src = coding->source + coding->consumed;
4449   const unsigned char *src_end = coding->source + coding->src_bytes;
4450   int *charbuf = coding->charbuf + coding->charbuf_used;
4451   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4452   int consumed_chars = 0;
4453   int multibytep = coding->src_multibyte;
4454   struct ccl_program ccl;
4455   int source_charbuf[1024];
4456   int source_byteidx[1024];
4457   Lisp_Object attrs, charset_list;
4458
4459   CODING_GET_INFO (coding, attrs, charset_list);
4460   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4461
4462   while (src < src_end)
4463     {
4464       const unsigned char *p = src;
4465       int *source, *source_end;
4466       int i = 0;
4467
4468       if (multibytep)
4469         while (i < 1024 && p < src_end)
4470           {
4471             source_byteidx[i] = p - src;
4472             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4473           }
4474       else
4475         while (i < 1024 && p < src_end)
4476           source_charbuf[i++] = *p++;
4477
4478       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4479         ccl.last_block = 1;
4480
4481       source = source_charbuf;
4482       source_end = source + i;
4483       while (source < source_end)
4484         {
4485           ccl_driver (&ccl, source, charbuf,
4486                       source_end - source, charbuf_end - charbuf,
4487                       charset_list);
4488           source += ccl.consumed;
4489           charbuf += ccl.produced;
4490           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4491             break;
4492         }
4493       if (source < source_end)
4494         src += source_byteidx[source - source_charbuf];
4495       else
4496         src = p;
4497       consumed_chars += source - source_charbuf;
4498
4499       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4500           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4501         break;
4502     }
4503
4504   switch (ccl.status)
4505     {
4506     case CCL_STAT_SUSPEND_BY_SRC:
4507       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4508       break;
4509     case CCL_STAT_SUSPEND_BY_DST:
4510       break;
4511     case CCL_STAT_QUIT:
4512     case CCL_STAT_INVALID_CMD:
4513       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4514       break;
4515     default:
4516       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4517       break;
4518     }
4519   coding->consumed_char += consumed_chars;
4520   coding->consumed = src - coding->source;
4521   coding->charbuf_used = charbuf - coding->charbuf;
4522 }
4523
4524 static int
4525 encode_coding_ccl (coding)
4526      struct coding_system *coding;
4527 {
4528   struct ccl_program ccl;
4529   int multibytep = coding->dst_multibyte;
4530   int *charbuf = coding->charbuf;
4531   int *charbuf_end = charbuf + coding->charbuf_used;
4532   unsigned char *dst = coding->destination + coding->produced;
4533   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4534   unsigned char *adjusted_dst_end = dst_end - 1;
4535   int destination_charbuf[1024];
4536   int i, produced_chars = 0;
4537   Lisp_Object attrs, charset_list;
4538
4539   CODING_GET_INFO (coding, attrs, charset_list);
4540   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4541
4542   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4543   ccl.dst_multibyte = coding->dst_multibyte;
4544
4545   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4546     {
4547       int dst_bytes = dst_end - dst;
4548       if (dst_bytes > 1024)
4549         dst_bytes = 1024;
4550
4551       ccl_driver (&ccl, charbuf, destination_charbuf,
4552                   charbuf_end - charbuf, dst_bytes, charset_list);
4553       charbuf += ccl.consumed;
4554       if (multibytep)
4555         for (i = 0; i < ccl.produced; i++)
4556           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4557       else
4558         {
4559           for (i = 0; i < ccl.produced; i++)
4560             *dst++ = destination_charbuf[i] & 0xFF;
4561           produced_chars += ccl.produced;
4562         }
4563     }
4564
4565   switch (ccl.status)
4566     {
4567     case CCL_STAT_SUSPEND_BY_SRC:
4568       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4569       break;
4570     case CCL_STAT_SUSPEND_BY_DST:
4571       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4572       break;
4573     case CCL_STAT_QUIT:
4574     case CCL_STAT_INVALID_CMD:
4575       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4576       break;
4577     default:
4578       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4579       break;
4580     }
4581
4582   coding->produced_char += produced_chars;
4583   coding->produced = dst - coding->destination;
4584   return 0;
4585 }
4586
4587
4588 \f
4589 /*** 10, 11. no-conversion handlers ***/
4590
4591 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4592
4593 static void
4594 decode_coding_raw_text (coding)
4595      struct coding_system *coding;
4596 {
4597   coding->chars_at_source = 1;
4598   coding->consumed_char = 0;
4599   coding->consumed = 0;
4600   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4601 }
4602
4603 static int
4604 encode_coding_raw_text (coding)
4605      struct coding_system *coding;
4606 {
4607   int multibytep = coding->dst_multibyte;
4608   int *charbuf = coding->charbuf;
4609   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4610   unsigned char *dst = coding->destination + coding->produced;
4611   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4612   int produced_chars = 0;
4613   int c;
4614
4615   if (multibytep)
4616     {
4617       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4618
4619       if (coding->src_multibyte)
4620         while (charbuf < charbuf_end)
4621           {
4622             ASSURE_DESTINATION (safe_room);
4623             c = *charbuf++;
4624             if (ASCII_CHAR_P (c))
4625               EMIT_ONE_ASCII_BYTE (c);
4626             else if (CHAR_BYTE8_P (c))
4627               {
4628                 c = CHAR_TO_BYTE8 (c);
4629                 EMIT_ONE_BYTE (c);
4630               }
4631             else
4632               {
4633                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4634
4635                 CHAR_STRING_ADVANCE (c, p1);
4636                 while (p0 < p1)
4637                   {
4638                     EMIT_ONE_BYTE (*p0);
4639                     p0++;
4640                   }
4641               }
4642           }
4643       else
4644         while (charbuf < charbuf_end)
4645           {
4646             ASSURE_DESTINATION (safe_room);
4647             c = *charbuf++;
4648             EMIT_ONE_BYTE (c);
4649           }
4650     }
4651   else
4652     {
4653       if (coding->src_multibyte)
4654         {
4655           int safe_room = MAX_MULTIBYTE_LENGTH;
4656
4657           while (charbuf < charbuf_end)
4658             {
4659               ASSURE_DESTINATION (safe_room);
4660               c = *charbuf++;
4661               if (ASCII_CHAR_P (c))
4662                 *dst++ = c;
4663               else if (CHAR_BYTE8_P (c))
4664                 *dst++ = CHAR_TO_BYTE8 (c);
4665               else
4666                 CHAR_STRING_ADVANCE (c, dst);
4667               produced_chars++;
4668             }
4669         }
4670       else
4671         {
4672           ASSURE_DESTINATION (charbuf_end - charbuf);
4673           while (charbuf < charbuf_end && dst < dst_end)
4674             *dst++ = *charbuf++;
4675           produced_chars = dst - (coding->destination + coding->dst_bytes);
4676         }
4677     }
4678   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4679   coding->produced_char += produced_chars;
4680   coding->produced = dst - coding->destination;
4681   return 0;
4682 }
4683
4684 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4685    Check if a text is encoded in a charset-based coding system.  If it
4686    is, return 1, else return 0.  */
4687
4688 static int
4689 detect_coding_charset (coding, detect_info)
4690      struct coding_system *coding;
4691      struct coding_detection_info *detect_info;
4692 {
4693   const unsigned char *src = coding->source, *src_base;
4694   const unsigned char *src_end = coding->source + coding->src_bytes;
4695   int multibytep = coding->src_multibyte;
4696   int consumed_chars = 0;
4697   Lisp_Object attrs, valids;
4698   int found = 0;
4699
4700   detect_info->checked |= CATEGORY_MASK_CHARSET;
4701
4702   coding = &coding_categories[coding_category_charset];
4703   attrs = CODING_ID_ATTRS (coding->id);
4704   valids = AREF (attrs, coding_attr_charset_valids);
4705
4706   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4707     src += coding->head_ascii;
4708
4709   while (1)
4710     {
4711       int c;
4712
4713       src_base = src;
4714       ONE_MORE_BYTE (c);
4715       if (c < 0)
4716         continue;
4717       if (NILP (AREF (valids, c)))
4718         break;
4719       if (c >= 0x80)
4720         found = CATEGORY_MASK_CHARSET;
4721     }
4722   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4723   return 0;
4724
4725  no_more_source:
4726   detect_info->found |= found;
4727   return 1;
4728 }
4729
4730 static void
4731 decode_coding_charset (coding)
4732      struct coding_system *coding;
4733 {
4734   const unsigned char *src = coding->source + coding->consumed;
4735   const unsigned char *src_end = coding->source + coding->src_bytes;
4736   const unsigned char *src_base;
4737   int *charbuf = coding->charbuf + coding->charbuf_used;
4738   int *charbuf_end
4739     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4740   int consumed_chars = 0, consumed_chars_base;
4741   int multibytep = coding->src_multibyte;
4742   Lisp_Object attrs, charset_list, valids;
4743   int char_offset = coding->produced_char;
4744   int last_offset = char_offset;
4745   int last_id = charset_ascii;
4746
4747   CODING_GET_INFO (coding, attrs, charset_list);
4748   valids = AREF (attrs, coding_attr_charset_valids);
4749
4750   while (1)
4751     {
4752       int c;
4753       Lisp_Object val;
4754       struct charset *charset;
4755       int dim;
4756       int len = 1;
4757       unsigned code;
4758
4759       src_base = src;
4760       consumed_chars_base = consumed_chars;
4761
4762       if (charbuf >= charbuf_end)
4763         break;
4764
4765       ONE_MORE_BYTE (c);
4766       if (c < 0)
4767         goto invalid_code;
4768       code = c;
4769
4770       val = AREF (valids, c);
4771       if (NILP (val))
4772         goto invalid_code;
4773       if (INTEGERP (val))
4774         {
4775           charset = CHARSET_FROM_ID (XFASTINT (val));
4776           dim = CHARSET_DIMENSION (charset);
4777           while (len < dim)
4778             {
4779               ONE_MORE_BYTE (c);
4780               code = (code << 8) | c;
4781               len++;
4782             }
4783           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4784                               charset, code, c);
4785         }
4786       else
4787         {
4788           /* VAL is a list of charset IDs.  It is assured that the
4789              list is sorted by charset dimensions (smaller one
4790              comes first).  */
4791           while (CONSP (val))
4792             {
4793               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4794               dim = CHARSET_DIMENSION (charset);
4795               while (len < dim)
4796                 {
4797                   ONE_MORE_BYTE (c);
4798                   code = (code << 8) | c;
4799                   len++;
4800                 }
4801               CODING_DECODE_CHAR (coding, src, src_base,
4802                                   src_end, charset, code, c);
4803               if (c >= 0)
4804                 break;
4805               val = XCDR (val);
4806             }
4807         }
4808       if (c < 0)
4809         goto invalid_code;
4810       if (charset->id != charset_ascii
4811           && last_id != charset->id)
4812         {
4813           if (last_id != charset_ascii)
4814             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4815           last_id = charset->id;
4816           last_offset = char_offset;
4817         }
4818
4819       *charbuf++ = c;
4820       char_offset++;
4821       continue;
4822
4823     invalid_code:
4824       src = src_base;
4825       consumed_chars = consumed_chars_base;
4826       ONE_MORE_BYTE (c);
4827       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4828       char_offset++;
4829       coding->errors++;
4830     }
4831
4832  no_more_source:
4833   if (last_id != charset_ascii)
4834     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4835   coding->consumed_char += consumed_chars_base;
4836   coding->consumed = src_base - coding->source;
4837   coding->charbuf_used = charbuf - coding->charbuf;
4838 }
4839
4840 static int
4841 encode_coding_charset (coding)
4842      struct coding_system *coding;
4843 {
4844   int multibytep = coding->dst_multibyte;
4845   int *charbuf = coding->charbuf;
4846   int *charbuf_end = charbuf + coding->charbuf_used;
4847   unsigned char *dst = coding->destination + coding->produced;
4848   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4849   int safe_room = MAX_MULTIBYTE_LENGTH;
4850   int produced_chars = 0;
4851   Lisp_Object attrs, charset_list;
4852   int ascii_compatible;
4853   int c;
4854
4855   CODING_GET_INFO (coding, attrs, charset_list);
4856   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4857
4858   while (charbuf < charbuf_end)
4859     {
4860       struct charset *charset;
4861       unsigned code;
4862
4863       ASSURE_DESTINATION (safe_room);
4864       c = *charbuf++;
4865       if (ascii_compatible && ASCII_CHAR_P (c))
4866         EMIT_ONE_ASCII_BYTE (c);
4867       else if (CHAR_BYTE8_P (c))
4868         {
4869           c = CHAR_TO_BYTE8 (c);
4870           EMIT_ONE_BYTE (c);
4871         }
4872       else
4873         {
4874           charset = char_charset (c, charset_list, &code);
4875           if (charset)
4876             {
4877               if (CHARSET_DIMENSION (charset) == 1)
4878                 EMIT_ONE_BYTE (code);
4879               else if (CHARSET_DIMENSION (charset) == 2)
4880                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4881               else if (CHARSET_DIMENSION (charset) == 3)
4882                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4883               else
4884                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4885                                  (code >> 8) & 0xFF, code & 0xFF);
4886             }
4887           else
4888             {
4889               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4890                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4891               else
4892                 c = coding->default_char;
4893               EMIT_ONE_BYTE (c);
4894             }
4895         }
4896     }
4897
4898   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4899   coding->produced_char += produced_chars;
4900   coding->produced = dst - coding->destination;
4901   return 0;
4902 }
4903
4904 \f
4905 /*** 10. C library functions ***/
4906
4907 /* Setup coding context CODING from information about CODING_SYSTEM.
4908    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4909    CODING_SYSTEM is invalid, signal an error.  */
4910
4911 void
4912 setup_coding_system (coding_system, coding)
4913      Lisp_Object coding_system;
4914      struct coding_system *coding;
4915 {
4916   Lisp_Object attrs;
4917   Lisp_Object eol_type;
4918   Lisp_Object coding_type;
4919   Lisp_Object val;
4920
4921   if (NILP (coding_system))
4922     coding_system = Qundecided;
4923
4924   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4925
4926   attrs = CODING_ID_ATTRS (coding->id);
4927   eol_type = CODING_ID_EOL_TYPE (coding->id);
4928
4929   coding->mode = 0;
4930   coding->head_ascii = -1;
4931   coding->common_flags
4932     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4933   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4934     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4935   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4936     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4937   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4938     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4939
4940   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4941   coding->max_charset_id = SCHARS (val) - 1;
4942   coding->safe_charsets = (char *) SDATA (val);
4943   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4944
4945   coding_type = CODING_ATTR_TYPE (attrs);
4946   if (EQ (coding_type, Qundecided))
4947     {
4948       coding->detector = NULL;
4949       coding->decoder = decode_coding_raw_text;
4950       coding->encoder = encode_coding_raw_text;
4951       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4952     }
4953   else if (EQ (coding_type, Qiso_2022))
4954     {
4955       int i;
4956       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4957
4958       /* Invoke graphic register 0 to plane 0.  */
4959       CODING_ISO_INVOCATION (coding, 0) = 0;
4960       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4961       CODING_ISO_INVOCATION (coding, 1)
4962         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4963       /* Setup the initial status of designation.  */
4964       for (i = 0; i < 4; i++)
4965         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4966       /* Not single shifting initially.  */
4967       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4968       /* Beginning of buffer should also be regarded as bol. */
4969       CODING_ISO_BOL (coding) = 1;
4970       coding->detector = detect_coding_iso_2022;
4971       coding->decoder = decode_coding_iso_2022;
4972       coding->encoder = encode_coding_iso_2022;
4973       if (flags & CODING_ISO_FLAG_SAFE)
4974         coding->mode |= CODING_MODE_SAFE_ENCODING;
4975       coding->common_flags
4976         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4977             | CODING_REQUIRE_FLUSHING_MASK);
4978       if (flags & CODING_ISO_FLAG_COMPOSITION)
4979         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4980       if (flags & CODING_ISO_FLAG_DESIGNATION)
4981         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4982       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4983         {
4984           setup_iso_safe_charsets (attrs);
4985           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4986           coding->max_charset_id = SCHARS (val) - 1;
4987           coding->safe_charsets = (char *) SDATA (val);
4988         }
4989       CODING_ISO_FLAGS (coding) = flags;
4990     }
4991   else if (EQ (coding_type, Qcharset))
4992     {
4993       coding->detector = detect_coding_charset;
4994       coding->decoder = decode_coding_charset;
4995       coding->encoder = encode_coding_charset;
4996       coding->common_flags
4997         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4998     }
4999   else if (EQ (coding_type, Qutf_8))
5000     {
5001       coding->detector = detect_coding_utf_8;
5002       coding->decoder = decode_coding_utf_8;
5003       coding->encoder = encode_coding_utf_8;
5004       coding->common_flags
5005         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5006     }
5007   else if (EQ (coding_type, Qutf_16))
5008     {
5009       val = AREF (attrs, coding_attr_utf_16_bom);
5010       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5011                                     : EQ (val, Qt) ? utf_16_with_bom
5012                                     : utf_16_without_bom);
5013       val = AREF (attrs, coding_attr_utf_16_endian);
5014       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5015                                        : utf_16_little_endian);
5016       CODING_UTF_16_SURROGATE (coding) = 0;
5017       coding->detector = detect_coding_utf_16;
5018       coding->decoder = decode_coding_utf_16;
5019       coding->encoder = encode_coding_utf_16;
5020       coding->common_flags
5021         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5022       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5023         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5024     }
5025   else if (EQ (coding_type, Qccl))
5026     {
5027       coding->detector = detect_coding_ccl;
5028       coding->decoder = decode_coding_ccl;
5029       coding->encoder = encode_coding_ccl;
5030       coding->common_flags
5031         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5032             | CODING_REQUIRE_FLUSHING_MASK);
5033     }
5034   else if (EQ (coding_type, Qemacs_mule))
5035     {
5036       coding->detector = detect_coding_emacs_mule;
5037       coding->decoder = decode_coding_emacs_mule;
5038       coding->encoder = encode_coding_emacs_mule;
5039       coding->common_flags
5040         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5041       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5042           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5043         {
5044           Lisp_Object tail, safe_charsets;
5045           int max_charset_id = 0;
5046
5047           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5048                tail = XCDR (tail))
5049             if (max_charset_id < XFASTINT (XCAR (tail)))
5050               max_charset_id = XFASTINT (XCAR (tail));
5051           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5052                                         make_number (255));
5053           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5054                tail = XCDR (tail))
5055             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5056           coding->max_charset_id = max_charset_id;
5057           coding->safe_charsets = (char *) SDATA (safe_charsets);
5058         }
5059     }
5060   else if (EQ (coding_type, Qshift_jis))
5061     {
5062       coding->detector = detect_coding_sjis;
5063       coding->decoder = decode_coding_sjis;
5064       coding->encoder = encode_coding_sjis;
5065       coding->common_flags
5066         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5067     }
5068   else if (EQ (coding_type, Qbig5))
5069     {
5070       coding->detector = detect_coding_big5;
5071       coding->decoder = decode_coding_big5;
5072       coding->encoder = encode_coding_big5;
5073       coding->common_flags
5074         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5075     }
5076   else                          /* EQ (coding_type, Qraw_text) */
5077     {
5078       coding->detector = NULL;
5079       coding->decoder = decode_coding_raw_text;
5080       coding->encoder = encode_coding_raw_text;
5081       if (! EQ (eol_type, Qunix))
5082         {
5083           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5084           if (! VECTORP (eol_type))
5085             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5086         }
5087
5088     }
5089
5090   return;
5091 }
5092
5093 /* Return a list of charsets supported by CODING.  */
5094
5095 Lisp_Object
5096 coding_charset_list (coding)
5097      struct coding_system *coding;
5098 {
5099   Lisp_Object attrs, charset_list;
5100
5101   CODING_GET_INFO (coding, attrs, charset_list);
5102   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5103     {
5104       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5105
5106       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5107         charset_list = Viso_2022_charset_list;
5108     }
5109   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5110     {
5111       charset_list = Vemacs_mule_charset_list;
5112     }
5113   return charset_list;
5114 }
5115
5116
5117 /* Return raw-text or one of its subsidiaries that has the same
5118    eol_type as CODING-SYSTEM.  */
5119
5120 Lisp_Object
5121 raw_text_coding_system (coding_system)
5122      Lisp_Object coding_system;
5123 {
5124   Lisp_Object spec, attrs;
5125   Lisp_Object eol_type, raw_text_eol_type;
5126
5127   if (NILP (coding_system))
5128     return Qraw_text;
5129   spec = CODING_SYSTEM_SPEC (coding_system);
5130   attrs = AREF (spec, 0);
5131
5132   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5133     return coding_system;
5134
5135   eol_type = AREF (spec, 2);
5136   if (VECTORP (eol_type))
5137     return Qraw_text;
5138   spec = CODING_SYSTEM_SPEC (Qraw_text);
5139   raw_text_eol_type = AREF (spec, 2);
5140   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5141           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5142           : AREF (raw_text_eol_type, 2));
5143 }
5144
5145
5146 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5147    does, return one of the subsidiary that has the same eol-spec as
5148    PARENT.  Otherwise, return CODING_SYSTEM.  */
5149
5150 Lisp_Object
5151 coding_inherit_eol_type (coding_system, parent)
5152      Lisp_Object coding_system, parent;
5153 {
5154   Lisp_Object spec, eol_type;
5155
5156   if (NILP (coding_system))
5157     coding_system = Qraw_text;
5158   spec = CODING_SYSTEM_SPEC (coding_system);
5159   eol_type = AREF (spec, 2);
5160   if (VECTORP (eol_type)
5161       && ! NILP (parent))
5162     {
5163       Lisp_Object parent_spec;
5164       Lisp_Object parent_eol_type;
5165
5166       parent_spec
5167         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5168       parent_eol_type = AREF (parent_spec, 2);
5169       if (EQ (parent_eol_type, Qunix))
5170         coding_system = AREF (eol_type, 0);
5171       else if (EQ (parent_eol_type, Qdos))
5172         coding_system = AREF (eol_type, 1);
5173       else if (EQ (parent_eol_type, Qmac))
5174         coding_system = AREF (eol_type, 2);
5175     }
5176   return coding_system;
5177 }
5178
5179 /* Emacs has a mechanism to automatically detect a coding system if it
5180    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5181    it's impossible to distinguish some coding systems accurately
5182    because they use the same range of codes.  So, at first, coding
5183    systems are categorized as follows:
5184
5185    o coding-category-emacs-mule
5186
5187         The category for a coding system which has the same code range
5188         as Emacs' internal format.  Assigned the coding-system (Lisp
5189         symbol) `emacs-mule' by default.
5190
5191    o coding-category-sjis
5192
5193         The category for a coding system which has the same code range
5194         as SJIS.  Assigned the coding-system (Lisp
5195         symbol) `japanese-shift-jis' by default.
5196
5197    o coding-category-iso-7
5198
5199         The category for a coding system which has the same code range
5200         as ISO2022 of 7-bit environment.  This doesn't use any locking
5201         shift and single shift functions.  This can encode/decode all
5202         charsets.  Assigned the coding-system (Lisp symbol)
5203         `iso-2022-7bit' by default.
5204
5205    o coding-category-iso-7-tight
5206
5207         Same as coding-category-iso-7 except that this can
5208         encode/decode only the specified charsets.
5209
5210    o coding-category-iso-8-1
5211
5212         The category for a coding system which has the same code range
5213         as ISO2022 of 8-bit environment and graphic plane 1 used only
5214         for DIMENSION1 charset.  This doesn't use any locking shift
5215         and single shift functions.  Assigned the coding-system (Lisp
5216         symbol) `iso-latin-1' by default.
5217
5218    o coding-category-iso-8-2
5219
5220         The category for a coding system which has the same code range
5221         as ISO2022 of 8-bit environment and graphic plane 1 used only
5222         for DIMENSION2 charset.  This doesn't use any locking shift
5223         and single shift functions.  Assigned the coding-system (Lisp
5224         symbol) `japanese-iso-8bit' by default.
5225
5226    o coding-category-iso-7-else
5227
5228         The category for a coding system which has the same code range
5229         as ISO2022 of 7-bit environment but uses locking shift or
5230         single shift functions.  Assigned the coding-system (Lisp
5231         symbol) `iso-2022-7bit-lock' by default.
5232
5233    o coding-category-iso-8-else
5234
5235         The category for a coding system which has the same code range
5236         as ISO2022 of 8-bit environment but uses locking shift or
5237         single shift functions.  Assigned the coding-system (Lisp
5238         symbol) `iso-2022-8bit-ss2' by default.
5239
5240    o coding-category-big5
5241
5242         The category for a coding system which has the same code range
5243         as BIG5.  Assigned the coding-system (Lisp symbol)
5244         `cn-big5' by default.
5245
5246    o coding-category-utf-8
5247
5248         The category for a coding system which has the same code range
5249         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5250         symbol) `utf-8' by default.
5251
5252    o coding-category-utf-16-be
5253
5254         The category for a coding system in which a text has an
5255         Unicode signature (cf. Unicode Standard) in the order of BIG
5256         endian at the head.  Assigned the coding-system (Lisp symbol)
5257         `utf-16-be' by default.
5258
5259    o coding-category-utf-16-le
5260
5261         The category for a coding system in which a text has an
5262         Unicode signature (cf. Unicode Standard) in the order of
5263         LITTLE endian at the head.  Assigned the coding-system (Lisp
5264         symbol) `utf-16-le' by default.
5265
5266    o coding-category-ccl
5267
5268         The category for a coding system of which encoder/decoder is
5269         written in CCL programs.  The default value is nil, i.e., no
5270         coding system is assigned.
5271
5272    o coding-category-binary
5273
5274         The category for a coding system not categorized in any of the
5275         above.  Assigned the coding-system (Lisp symbol)
5276         `no-conversion' by default.
5277
5278    Each of them is a Lisp symbol and the value is an actual
5279    `coding-system's (this is also a Lisp symbol) assigned by a user.
5280    What Emacs does actually is to detect a category of coding system.
5281    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5282    decide only one possible category, it selects a category of the
5283    highest priority.  Priorities of categories are also specified by a
5284    user in a Lisp variable `coding-category-list'.
5285
5286 */
5287
5288 #define EOL_SEEN_NONE   0
5289 #define EOL_SEEN_LF     1
5290 #define EOL_SEEN_CR     2
5291 #define EOL_SEEN_CRLF   4
5292
5293 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5294    SOURCE is encoded.  If CATEGORY is one of
5295    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5296    two-byte, else they are encoded by one-byte.
5297
5298    Return one of EOL_SEEN_XXX.  */
5299
5300 #define MAX_EOL_CHECK_COUNT 3
5301
5302 static int
5303 detect_eol (source, src_bytes, category)
5304      const unsigned char *source;
5305      EMACS_INT src_bytes;
5306      enum coding_category category;
5307 {
5308   const unsigned char *src = source, *src_end = src + src_bytes;
5309   unsigned char c;
5310   int total  = 0;
5311   int eol_seen = EOL_SEEN_NONE;
5312
5313   if ((1 << category) & CATEGORY_MASK_UTF_16)
5314     {
5315       int msb, lsb;
5316
5317       msb = category == (coding_category_utf_16_le
5318                          | coding_category_utf_16_le_nosig);
5319       lsb = 1 - msb;
5320
5321       while (src + 1 < src_end)
5322         {
5323           c = src[lsb];
5324           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5325             {
5326               int this_eol;
5327
5328               if (c == '\n')
5329                 this_eol = EOL_SEEN_LF;
5330               else if (src + 3 >= src_end
5331                        || src[msb + 2] != 0
5332                        || src[lsb + 2] != '\n')
5333                 this_eol = EOL_SEEN_CR;
5334               else
5335                 this_eol = EOL_SEEN_CRLF;
5336
5337               if (eol_seen == EOL_SEEN_NONE)
5338                 /* This is the first end-of-line.  */
5339                 eol_seen = this_eol;
5340               else if (eol_seen != this_eol)
5341                 {
5342                   /* The found type is different from what found before.  */
5343                   eol_seen = EOL_SEEN_LF;
5344                   break;
5345                 }
5346               if (++total == MAX_EOL_CHECK_COUNT)
5347                 break;
5348             }
5349           src += 2;
5350         }
5351     }
5352   else
5353     {
5354       while (src < src_end)
5355         {
5356           c = *src++;
5357           if (c == '\n' || c == '\r')
5358             {
5359               int this_eol;
5360
5361               if (c == '\n')
5362                 this_eol = EOL_SEEN_LF;
5363               else if (src >= src_end || *src != '\n')
5364                 this_eol = EOL_SEEN_CR;
5365               else
5366                 this_eol = EOL_SEEN_CRLF, src++;
5367
5368               if (eol_seen == EOL_SEEN_NONE)
5369                 /* This is the first end-of-line.  */
5370                 eol_seen = this_eol;
5371               else if (eol_seen != this_eol)
5372                 {
5373                   /* The found type is different from what found before.  */
5374                   eol_seen = EOL_SEEN_LF;
5375                   break;
5376                 }
5377               if (++total == MAX_EOL_CHECK_COUNT)
5378                 break;
5379             }
5380         }
5381     }
5382   return eol_seen;
5383 }
5384
5385
5386 static Lisp_Object
5387 adjust_coding_eol_type (coding, eol_seen)
5388      struct coding_system *coding;
5389      int eol_seen;
5390 {
5391   Lisp_Object eol_type;
5392
5393   eol_type = CODING_ID_EOL_TYPE (coding->id);
5394   if (eol_seen & EOL_SEEN_LF)
5395     {
5396       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5397       eol_type = Qunix;
5398     }
5399   else if (eol_seen & EOL_SEEN_CRLF)
5400     {
5401       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5402       eol_type = Qdos;
5403     }
5404   else if (eol_seen & EOL_SEEN_CR)
5405     {
5406       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5407       eol_type = Qmac;
5408     }
5409   return eol_type;
5410 }
5411
5412 /* Detect how a text specified in CODING is encoded.  If a coding
5413    system is detected, update fields of CODING by the detected coding
5414    system.

   The consumed/produced counters of CODING are reset first.

   When the type of the coding system is `undecided', the leading
   bytes are scanned up to the first byte that has the high bit set;
   an ESC, SI or SO met on the way triggers ISO-2022 detection (unless
   inhibit_iso_escape_detection is set or a detector has already
   run).  CODING->head_ascii records where that scan stopped.  The
   categories listed in coding_priorities are then tried in order and
   CODING is set up for the first one that is found.  Failing that,
   raw-text is used if every category was rejected; otherwise, if
   some categories were rejected, the first category in priority
   order that was not rejected is used.

   When the category of the coding system is utf-16-auto, the pair
   stored in its coding_attr_utf_16_bom attribute supplies the
   little-endian (car) and big-endian (cdr) coding systems, chosen
   according to what detect_coding_utf_16 finds.  */
5415
5416 void
5417 detect_coding (coding)
5418      struct coding_system *coding;
5419 {
5420   const unsigned char *src, *src_end;
5421
5422   coding->consumed = coding->consumed_char = 0;
5423   coding->produced = coding->produced_char = 0;
5424   coding_set_source (coding);
5425
5426   src_end = coding->source + coding->src_bytes;
5427
5428   /* If we have not yet decided the text encoding type, detect it
5429      now.  */
5430   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5431     {
5432       int c, i;
5433       struct coding_detection_info detect_info;
5434
5435       detect_info.checked = detect_info.found = detect_info.rejected = 0;
           /* Scan the leading bytes that have the high bit clear.  */
5436       for (i = 0, src = coding->source; src < src_end; i++, src++)
5437         {
5438           c = *src;
5439           if (c & 0x80)
5440             break;
5441           if (c < 0x20
5442               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5443               && ! inhibit_iso_escape_detection
5444               && ! detect_info.checked)
5445             {
5446               coding->head_ascii = src - (coding->source + coding->consumed);
5447               if (detect_coding_iso_2022 (coding, &detect_info))
5448                 {
5449                   /* We have scanned the whole data.  */
5450                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5451                     /* We didn't find an 8-bit code.  */
5452                     src = src_end;
5453                   break;
5454                 }
5455             }
5456         }
5457       coding->head_ascii = src - (coding->source + coding->consumed);
5458
5459       if (coding->head_ascii < coding->src_bytes
5460           || detect_info.found)
5461         {
5462           enum coding_category category;
5463           struct coding_system *this;
5464
5465           if (coding->head_ascii == coding->src_bytes)
5466             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5467             for (i = 0; i < coding_category_raw_text; i++)
5468               {
5469                 category = coding_priorities[i];
5470                 this = coding_categories + category;
5471                 if (detect_info.found & (1 << category))
5472                   break;
5473               }
5474           else
5475             for (i = 0; i < coding_category_raw_text; i++)
5476               {
5477                 category = coding_priorities[i];
5478                 this = coding_categories + category;
5479                 if (this->id < 0)
5480                   {
5481                     /* No coding system of this category is defined.  */
5482                     detect_info.rejected |= (1 << category);
5483                   }
5484                 else if (category >= coding_category_raw_text)
5485                   continue;
5486                 else if (detect_info.checked & (1 << category))
5487                   {
5488                     if (detect_info.found & (1 << category))
5489                       break;
5490                   }
5491                 else if ((*(this->detector)) (coding, &detect_info)
5492                          && detect_info.found & (1 << category))
5493                   {
5494                     if (category == coding_category_utf_16_auto)
5495                       {
5496                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5497                           category = coding_category_utf_16_le;
5498                         else
5499                           category = coding_category_utf_16_be;
5500                       }
5501                     break;
5502                   }
5503               }
5504
               /* I is short of the limit only when one of the loops above
                  broke out on a category that was found.  */
5505           if (i < coding_category_raw_text)
5506             setup_coding_system (CODING_ID_NAME (this->id), coding);
5507           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5508             setup_coding_system (Qraw_text, coding);
5509           else if (detect_info.rejected)
5510             for (i = 0; i < coding_category_raw_text; i++)
5511               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5512                 {
5513                   this = coding_categories + coding_priorities[i];
5514                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5515                   break;
5516                 }
5517         }
5518     }
5519   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5520            == coding_category_utf_16_auto)
5521     {
5522       Lisp_Object coding_systems;
5523       struct coding_detection_info detect_info;
5524
5525       coding_systems
5526         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
           /* NOTE(review): detect_info.checked is not initialized here,
              unlike in the branch above -- confirm that
              detect_coding_utf_16 does not read it before writing.  */
5527       detect_info.found = detect_info.rejected = 0;
5528       if (CONSP (coding_systems)
5529           && detect_coding_utf_16 (coding, &detect_info))
5530         {
5531           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5532             setup_coding_system (XCAR (coding_systems), coding);
5533           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5534             setup_coding_system (XCDR (coding_systems), coding);
5535         }
5536     }
5537 }
5538
5539
5540 static void
5541 decode_eol (coding)
5542      struct coding_system *coding;
5543 {
5544   Lisp_Object eol_type;
5545   unsigned char *p, *pbeg, *pend;
5546
5547   eol_type = CODING_ID_EOL_TYPE (coding->id);
5548   if (EQ (eol_type, Qunix))
5549     return;
5550
5551   if (NILP (coding->dst_object))
5552     pbeg = coding->destination;
5553   else
5554     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5555   pend = pbeg + coding->produced;
5556
5557   if (VECTORP (eol_type))
5558     {
5559       int eol_seen = EOL_SEEN_NONE;
5560
5561       for (p = pbeg; p < pend; p++)
5562         {
5563           if (*p == '\n')
5564             eol_seen |= EOL_SEEN_LF;
5565           else if (*p == '\r')
5566             {
5567               if (p + 1 < pend && *(p + 1) == '\n')
5568                 {
5569                   eol_seen |= EOL_SEEN_CRLF;
5570                   p++;
5571                 }
5572               else
5573                 eol_seen |= EOL_SEEN_CR;
5574             }
5575         }
5576       if (eol_seen != EOL_SEEN_NONE
5577           && eol_seen != EOL_SEEN_LF
5578           && eol_seen != EOL_SEEN_CRLF
5579           && eol_seen != EOL_SEEN_CR)
5580         eol_seen = EOL_SEEN_LF;
5581       if (eol_seen != EOL_SEEN_NONE)
5582         eol_type = adjust_coding_eol_type (coding, eol_seen);
5583     }
5584
5585   if (EQ (eol_type, Qmac))
5586     {
5587       for (p = pbeg; p < pend; p++)
5588         if (*p == '\r')
5589           *p = '\n';
5590     }
5591   else if (EQ (eol_type, Qdos))
5592     {
5593       int n = 0;
5594
5595       if (NILP (coding->dst_object))
5596         {
5597           for (p = pend - 2; p >= pbeg; p--)
5598             if (*p == '\r')
5599               {
5600                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5601                 n++;
5602               }
5603         }
5604       else
5605         {
5606           for (p = pend - 2; p >= pbeg; p--)
5607             if (*p == '\r')
5608               {
5609                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5610                 int pos = BYTE_TO_CHAR (pos_byte);
5611
5612                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5613                 n++;
5614               }
5615         }
5616       coding->produced -= n;
5617       coding->produced_char -= n;
5618     }
5619 }
5620
5621
5622 /* Return a translation table (or list of them) from coding system
5623    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5624    decoding (ENCODEP is zero). */
5625
5626 static Lisp_Object
5627 get_translation_table (attrs, encodep, max_lookup)
5628      Lisp_Object attrs;
5629      int encodep, *max_lookup;
5630 {
5631   Lisp_Object standard, translation_table;
5632   Lisp_Object val;
5633
5634   if (encodep)
5635     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5636       standard = Vstandard_translation_table_for_encode;
5637   else
5638     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5639       standard = Vstandard_translation_table_for_decode;
5640   if (NILP (translation_table))
5641     translation_table = standard;
5642   else
5643     {
5644       if (SYMBOLP (translation_table))
5645         translation_table = Fget (translation_table, Qtranslation_table);
5646       else if (CONSP (translation_table))
5647         {
5648           translation_table = Fcopy_sequence (translation_table);
5649           for (val = translation_table; CONSP (val); val = XCDR (val))
5650             if (SYMBOLP (XCAR (val)))
5651               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5652         }
5653       if (CHAR_TABLE_P (standard))
5654         {
5655           if (CONSP (translation_table))
5656             translation_table = nconc2 (translation_table,
5657                                         Fcons (standard, Qnil));
5658           else
5659             translation_table = Fcons (translation_table,
5660                                        Fcons (standard, Qnil));
5661         }
5662     }
5663
5664   if (max_lookup)
5665     {
5666       *max_lookup = 1;
5667       if (CHAR_TABLE_P (translation_table)
5668           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5669         {
5670           val = XCHAR_TABLE (translation_table)->extras[1];
5671           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5672             *max_lookup = XFASTINT (val);
5673         }
5674       else if (CONSP (translation_table))
5675         {
5676           Lisp_Object tail, val;
5677
5678           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5679             if (CHAR_TABLE_P (XCAR (tail))
5680                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5681               {
5682                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5683                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5684                   *max_lookup = XFASTINT (val);
5685               }
5686         }
5687     }
5688   return translation_table;
5689 }
5690
/* Look up character C in TABLE, which is a char-table or a list whose
   char-table elements are consulted in order (any other TABLE leaves
   C untouched and sets TRANS to Qnil).

   When the entry found for C is a character, C is replaced by that
   character and TRANS is set to Qnil; in a list the lookup then goes
   on with the next char-table using the new C.  Any other entry is
   stored in TRANS; in a list the lookup stops at the first such
   entry that is non-nil.

   C and TRANS must be lvalues.  TABLE and C are evaluated more than
   once.  */
5691 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
5692   do {                                                          \
5693     trans = Qnil;                                               \
5694     if (CHAR_TABLE_P (table))                                   \
5695       {                                                         \
5696         trans = CHAR_TABLE_REF (table, c);                      \
5697         if (CHARACTERP (trans))                                 \
5698           c = XFASTINT (trans), trans = Qnil;                   \
5699       }                                                         \
5700     else if (CONSP (table))                                     \
5701       {                                                         \
5702         Lisp_Object tail;                                       \
5703                                                                 \
5704         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
5705           if (CHAR_TABLE_P (XCAR (tail)))                       \
5706             {                                                   \
5707               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
5708               if (CHARACTERP (trans))                           \
5709                 c = XFASTINT (trans), trans = Qnil;             \
5710               else if (! NILP (trans))                          \
5711                 break;                                          \
5712             }                                                   \
5713       }                                                         \
5714   } while (0)
5715
5716
5717 static Lisp_Object
5718 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5719      Lisp_Object val;
5720      int *buf, *buf_end;
5721      int last_block;
5722      int *from_nchars, *to_nchars;
5723 {
5724   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5725      [TO-CHAR ...].  */
5726   if (CONSP (val))
5727     {
5728       Lisp_Object from, tail;
5729       int i, len;
5730
5731       for (tail = val; CONSP (tail); tail = XCDR (tail))
5732         {
5733           val = XCAR (tail);
5734           from = XCAR (val);
5735           len = ASIZE (from);
5736           for (i = 0; i < len; i++)
5737             {
5738               if (buf + i == buf_end)
5739                 {
5740                   if (! last_block)
5741                     return Qt;
5742                   break;
5743                 }
5744               if (XINT (AREF (from, i)) != buf[i])
5745                 break;
5746             }
5747           if (i == len)
5748             {
5749               val = XCDR (val);
5750               *from_nchars = len;
5751               break;
5752             }
5753         }
5754       if (! CONSP (tail))
5755         return Qnil;
5756     }
5757   if (VECTORP (val))
5758     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5759   else
5760     *buf = XINT (val);
5761   return val;
5762 }
5763
5764
5765 static int
5766 produce_chars (coding, translation_table, last_block)
5767      struct coding_system *coding;
5768      Lisp_Object translation_table;
5769      int last_block;
5770 {
5771   unsigned char *dst = coding->destination + coding->produced;
5772   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5773   int produced;
5774   int produced_chars = 0;
5775   int carryover = 0;
5776
5777   if (! coding->chars_at_source)
5778     {
5779       /* Characters are in coding->charbuf.  */
5780       int *buf = coding->charbuf;
5781       int *buf_end = buf + coding->charbuf_used;
5782
5783       if (BUFFERP (coding->src_object)
5784           && EQ (coding->src_object, coding->dst_object))
5785         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5786
5787       while (buf < buf_end)
5788         {
5789           int c = *buf, i;
5790
5791           if (c >= 0)
5792             {
5793               int from_nchars = 1, to_nchars = 1;
5794               Lisp_Object trans = Qnil;
5795
5796               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5797               if (! NILP (trans))
5798                 {
5799                   trans = get_translation (trans, buf, buf_end, last_block,
5800                                            &from_nchars, &to_nchars);
5801                   if (EQ (trans, Qt))
5802                     break;
5803                   c = *buf;
5804                 }
5805
5806               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5807                 {
5808                   dst = alloc_destination (coding,
5809                                            buf_end - buf
5810                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5811                                            dst);
5812                   dst_end = coding->destination + coding->dst_bytes;
5813                 }
5814
5815               for (i = 0; i < to_nchars; i++)
5816                 {
5817                   if (i > 0)
5818                     c = XINT (AREF (trans, i));
5819                   if (coding->dst_multibyte
5820                       || ! CHAR_BYTE8_P (c))
5821                     CHAR_STRING_ADVANCE (c, dst);
5822                   else
5823                     *dst++ = CHAR_TO_BYTE8 (c);
5824                 }
5825               produced_chars += to_nchars;
5826               *buf++ = to_nchars;
5827               while (--from_nchars > 0)
5828                 *buf++ = 0;
5829             }
5830           else
5831             /* This is an annotation datum.  (-C) is the length.  */
5832             buf += -c;
5833         }
5834       carryover = buf_end - buf;
5835     }
5836   else
5837     {
5838       const unsigned char *src = coding->source;
5839       const unsigned char *src_end = src + coding->src_bytes;
5840       Lisp_Object eol_type;
5841
5842       eol_type = CODING_ID_EOL_TYPE (coding->id);
5843
5844       if (coding->src_multibyte != coding->dst_multibyte)
5845         {
5846           if (coding->src_multibyte)
5847             {
5848               int multibytep = 1;
5849               int consumed_chars;
5850
5851               while (1)
5852                 {
5853                   const unsigned char *src_base = src;
5854                   int c;
5855
5856                   ONE_MORE_BYTE (c);
5857                   if (c == '\r')
5858                     {
5859                       if (EQ (eol_type, Qdos))
5860                         {
5861                           if (src == src_end)
5862                             {
5863                               record_conversion_result
5864                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5865                               goto no_more_source;
5866                             }
5867                           if (*src == '\n')
5868                             c = *src++;
5869                         }
5870                       else if (EQ (eol_type, Qmac))
5871                         c = '\n';
5872                     }
5873                   if (dst == dst_end)
5874                     {
5875                       coding->consumed = src - coding->source;
5876
5877                     if (EQ (coding->src_object, coding->dst_object))
5878                       dst_end = (unsigned char *) src;
5879                     if (dst == dst_end)
5880                       {
5881                         dst = alloc_destination (coding, src_end - src + 1,
5882                                                  dst);
5883                         dst_end = coding->destination + coding->dst_bytes;
5884                         coding_set_source (coding);
5885                         src = coding->source + coding->consumed;
5886                         src_end = coding->source + coding->src_bytes;
5887                       }
5888                     }
5889                   *dst++ = c;
5890                   produced_chars++;
5891                 }
5892             no_more_source:
5893               ;
5894             }
5895           else
5896             while (src < src_end)
5897               {
5898                 int multibytep = 1;
5899                 int c = *src++;
5900
5901                 if (c == '\r')
5902                   {
5903                     if (EQ (eol_type, Qdos))
5904                       {
5905                         if (src < src_end
5906                             && *src == '\n')
5907                           c = *src++;
5908                       }
5909                     else if (EQ (eol_type, Qmac))
5910                       c = '\n';
5911                   }
5912                 if (dst >= dst_end - 1)
5913                   {
5914                     coding->consumed = src - coding->source;
5915
5916                     if (EQ (coding->src_object, coding->dst_object))
5917                       dst_end = (unsigned char *) src;
5918                     if (dst >= dst_end - 1)
5919                       {
5920                         dst = alloc_destination (coding, src_end - src + 2,
5921                                                  dst);
5922                         dst_end = coding->destination + coding->dst_bytes;
5923                         coding_set_source (coding);
5924                         src = coding->source + coding->consumed;
5925                         src_end = coding->source + coding->src_bytes;
5926                       }
5927                   }
5928                 EMIT_ONE_BYTE (c);
5929               }
5930         }
5931       else
5932         {
5933           if (!EQ (coding->src_object, coding->dst_object))
5934             {
5935               int require = coding->src_bytes - coding->dst_bytes;
5936
5937               if (require > 0)
5938                 {
5939                   EMACS_INT offset = src - coding->source;
5940
5941                   dst = alloc_destination (coding, require, dst);
5942                   coding_set_source (coding);
5943                   src = coding->source + offset;
5944                   src_end = coding->source + coding->src_bytes;
5945                 }
5946             }
5947           produced_chars = coding->src_chars;
5948           while (src < src_end)
5949             {
5950               int c = *src++;
5951
5952               if (c == '\r')
5953                 {
5954                   if (EQ (eol_type, Qdos))
5955                     {
5956                       if (src < src_end
5957                           && *src == '\n')
5958                         c = *src++;
5959                       produced_chars--;
5960                     }
5961                   else if (EQ (eol_type, Qmac))
5962                     c = '\n';
5963                 }
5964               *dst++ = c;
5965             }
5966         }
5967       coding->consumed = coding->src_bytes;
5968       coding->consumed_char = coding->src_chars;
5969     }
5970
5971   produced = dst - (coding->destination + coding->produced);
5972   if (BUFFERP (coding->dst_object))
5973     insert_from_gap (produced_chars, produced);
5974   coding->produced += produced;
5975   coding->produced_char += produced_chars;
5976   return carryover;
5977 }
5978
5979 /* Compose text in CODING->object according to the annotation data at
5980    CHARBUF.  CHARBUF is an array:
5981      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5982  */
5983
5984 static INLINE void
5985 produce_composition (coding, charbuf, pos)
5986      struct coding_system *coding;
5987      int *charbuf;
5988      EMACS_INT pos;
5989 {
5990   int len;
5991   EMACS_INT to;
5992   enum composition_method method;
5993   Lisp_Object components;
5994
5995   len = -charbuf[0];
5996   to = pos + charbuf[2];
5997   if (to <= pos)
5998     return;
5999   method = (enum composition_method) (charbuf[3]);
6000
6001   if (method == COMPOSITION_RELATIVE)
6002     components = Qnil;
6003   else if (method >= COMPOSITION_WITH_RULE
6004            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6005     {
6006       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6007       int i;
6008
6009       len -= 4;
6010       charbuf += 4;
6011       for (i = 0; i < len; i++)
6012         {
6013           args[i] = make_number (charbuf[i]);
6014           if (args[i] < 0)
6015             return;
6016         }
6017       components = (method == COMPOSITION_WITH_ALTCHARS
6018                     ? Fstring (len, args) : Fvector (len, args));
6019     }
6020   else
6021     return;
6022   compose_text (pos, to, components, Qnil, coding->dst_object);
6023 }
6024
6025
6026 /* Put `charset' property on text in CODING->object according to
6027    the annotation data at CHARBUF.  CHARBUF is an array:
6028      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6029  */
6030
6031 static INLINE void
6032 produce_charset (coding, charbuf, pos)
6033      struct coding_system *coding;
6034      int *charbuf;
6035      EMACS_INT pos;
6036 {
6037   EMACS_INT from = pos - charbuf[2];
6038   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6039
6040   Fput_text_property (make_number (from), make_number (pos),
6041                       Qcharset, CHARSET_NAME (charset),
6042                       coding->dst_object);
6043 }
6044
6045
/* Initial number of elements tried for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca, i.e. in the
   stack frame of the function that uses this macro, and record its
   size in CODING->charbuf_size.  The size starts at CHARBUF_SIZE
   elements and is halved after each failed attempt while it stays
   above 1024.

   If no area could be allocated, CODING_RESULT_INSUFFICIENT_MEM is
   recorded and the macro RETURNS CODING->result FROM THE ENCLOSING
   FUNCTION, so it can only be used in functions returning int.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
6067
6068
6069 static void
6070 produce_annotation (coding, pos)
6071      struct coding_system *coding;
6072      EMACS_INT pos;
6073 {
6074   int *charbuf = coding->charbuf;
6075   int *charbuf_end = charbuf + coding->charbuf_used;
6076
6077   if (NILP (coding->dst_object))
6078     return;
6079
6080   while (charbuf < charbuf_end)
6081     {
6082       if (*charbuf >= 0)
6083         pos += *charbuf++;
6084       else
6085         {
6086           int len = -*charbuf;
6087           switch (charbuf[1])
6088             {
6089             case CODING_ANNOTATE_COMPOSITION_MASK:
6090               produce_composition (coding, charbuf, pos);
6091               break;
6092             case CODING_ANNOTATE_CHARSET_MASK:
6093               produce_charset (coding, charbuf, pos);
6094               break;
6095             default:
6096               abort ();
6097             }
6098           charbuf += len;
6099         }
6100     }
6101 }
6102
6103 /* Decode the data at CODING->src_object into CODING->dst_object.
6104    CODING->src_object is a buffer, a string, or nil.
6105    CODING->dst_object is a buffer.
6106
6107    If CODING->src_object is a buffer, it must be the current buffer.
6108    In this case, if CODING->src_pos is positive, it is a position of
6109    the source text in the buffer, otherwise, the source text is in the
6110    gap area of the buffer, and CODING->src_pos specifies the offset of
6111    the text from GPT (which must be the same as PT).  If this is the
6112    same buffer as CODING->dst_object, CODING->src_pos must be
6113    negative.
6114
6115    If CODING->src_object is a string, CODING->src_pos in an index to
6116    that string.
6117
6118    If CODING->src_object is nil, CODING->source must already point to
6119    the non-relocatable memory area.  In this case, CODING->src_pos is
6120    an offset from CODING->source.
6121
6122    The decoded data is inserted at the current point of the buffer
6123    CODING->dst_object.
6124 */
6125
6126 static int
6127 decode_coding (coding)
6128      struct coding_system *coding;
6129 {
6130   Lisp_Object attrs;
6131   Lisp_Object undo_list;
6132   Lisp_Object translation_table;
6133   int carryover;
6134   int i;
6135
6136   if (BUFFERP (coding->src_object)
6137       && coding->src_pos > 0
6138       && coding->src_pos < GPT
6139       && coding->src_pos + coding->src_chars > GPT)
6140     move_gap_both (coding->src_pos, coding->src_pos_byte);
6141
6142   undo_list = Qt;
6143   if (BUFFERP (coding->dst_object))
6144     {
6145       if (current_buffer != XBUFFER (coding->dst_object))
6146         set_buffer_internal (XBUFFER (coding->dst_object));
6147       if (GPT != PT)
6148         move_gap_both (PT, PT_BYTE);
6149       undo_list = current_buffer->undo_list;
6150       current_buffer->undo_list = Qt;
6151     }
6152
6153   coding->consumed = coding->consumed_char = 0;
6154   coding->produced = coding->produced_char = 0;
6155   coding->chars_at_source = 0;
6156   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6157   coding->errors = 0;
6158
6159   ALLOC_CONVERSION_WORK_AREA (coding);
6160
6161   attrs = CODING_ID_ATTRS (coding->id);
6162   translation_table = get_translation_table (attrs, 0, NULL);
6163
6164   carryover = 0;
6165   do
6166     {
6167       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6168
6169       coding_set_source (coding);
6170       coding->annotated = 0;
6171       coding->charbuf_used = carryover;
6172       (*(coding->decoder)) (coding);
6173       coding_set_destination (coding);
6174       carryover = produce_chars (coding, translation_table, 0);
6175       if (coding->annotated)
6176         produce_annotation (coding, pos);
6177       for (i = 0; i < carryover; i++)
6178         coding->charbuf[i]
6179           = coding->charbuf[coding->charbuf_used - carryover + i];
6180     }
6181   while (coding->consumed < coding->src_bytes
6182          && ! coding->result);
6183
6184   if (carryover > 0)
6185     {
6186       coding_set_destination (coding);
6187       coding->charbuf_used = carryover;
6188       produce_chars (coding, translation_table, 1);
6189     }
6190
6191   coding->carryover_bytes = 0;
6192   if (coding->consumed < coding->src_bytes)
6193     {
6194       int nbytes = coding->src_bytes - coding->consumed;
6195       const unsigned char *src;
6196
6197       coding_set_source (coding);
6198       coding_set_destination (coding);
6199       src = coding->source + coding->consumed;
6200
6201       if (coding->mode & CODING_MODE_LAST_BLOCK)
6202         {
6203           /* Flush out unprocessed data as binary chars.  We are sure
6204              that the number of data is less than the size of
6205              coding->charbuf.  */
6206           coding->charbuf_used = 0;
6207           while (nbytes-- > 0)
6208             {
6209               int c = *src++;
6210
6211               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
6212             }
6213           produce_chars (coding, Qnil, 1);
6214         }
6215       else
6216         {
6217           /* Record unprocessed bytes in coding->carryover.  We are
6218              sure that the number of data is less than the size of
6219              coding->carryover.  */
6220           unsigned char *p = coding->carryover;
6221
6222           coding->carryover_bytes = nbytes;
6223           while (nbytes-- > 0)
6224             *p++ = *src++;
6225         }
6226       coding->consumed = coding->src_bytes;
6227     }
6228
6229   if (BUFFERP (coding->dst_object))
6230     {
6231       current_buffer->undo_list = undo_list;
6232       record_insert (coding->dst_pos, coding->produced_char);
6233     }
6234   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6235     decode_eol (coding);
6236   return coding->result;
6237 }
6238
6239
6240 /* Extract an annotation datum from a composition starting at POS and
6241    ending before LIMIT of CODING->src_object (buffer or string), store
6242    the data in BUF, set *STOP to a starting position of the next
6243    composition (if any) or to LIMIT, and return the address of the
6244    next element of BUF.
6245
6246    If such an annotation is not found, set *STOP to a starting
6247    position of a composition after POS (if any) or to LIMIT, and
6248    return BUF.  */
6249
6250 static INLINE int *
6251 handle_composition_annotation (pos, limit, coding, buf, stop)
6252      EMACS_INT pos, limit;
6253      struct coding_system *coding;
6254      int *buf;
6255      EMACS_INT *stop;
6256 {
6257   EMACS_INT start, end;
6258   Lisp_Object prop;
6259
6260   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6261       || end > limit)
6262     *stop = limit;
6263   else if (start > pos)
6264     *stop = start;
6265   else
6266     {
6267       if (start == pos)
6268         {
6269           /* We found a composition.  Store the corresponding
6270              annotation data in BUF.  */
6271           int *head = buf;
6272           enum composition_method method = COMPOSITION_METHOD (prop);
6273           int nchars = COMPOSITION_LENGTH (prop);
6274
6275           ADD_COMPOSITION_DATA (buf, nchars, method);
6276           if (method != COMPOSITION_RELATIVE)
6277             {
6278               Lisp_Object components;
6279               int len, i, i_byte;
6280
6281               components = COMPOSITION_COMPONENTS (prop);
6282               if (VECTORP (components))
6283                 {
6284                   len = XVECTOR (components)->size;
6285                   for (i = 0; i < len; i++)
6286                     *buf++ = XINT (AREF (components, i));
6287                 }
6288               else if (STRINGP (components))
6289                 {
6290                   len = SCHARS (components);
6291                   i = i_byte = 0;
6292                   while (i < len)
6293                     {
6294                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6295                       buf++;
6296                     }
6297                 }
6298               else if (INTEGERP (components))
6299                 {
6300                   len = 1;
6301                   *buf++ = XINT (components);
6302                 }
6303               else if (CONSP (components))
6304                 {
6305                   for (len = 0; CONSP (components);
6306                        len++, components = XCDR (components))
6307                     *buf++ = XINT (XCAR (components));
6308                 }
6309               else
6310                 abort ();
6311               *head -= len;
6312             }
6313         }
6314
6315       if (find_composition (end, limit, &start, &end, &prop,
6316                             coding->src_object)
6317           && end <= limit)
6318         *stop = start;
6319       else
6320         *stop = limit;
6321     }
6322   return buf;
6323 }
6324
6325
6326 /* Extract an annotation datum from a text property `charset' at POS of
6327    CODING->src_object (buffer of string), store the data in BUF, set
6328    *STOP to the position where the value of `charset' property changes
6329    (limiting by LIMIT), and return the address of the next element of
6330    BUF.
6331
6332    If the property value is nil, set *STOP to the position where the
6333    property value is non-nil (limiting by LIMIT), and return BUF.  */
6334
6335 static INLINE int *
6336 handle_charset_annotation (pos, limit, coding, buf, stop)
6337      EMACS_INT pos, limit;
6338      struct coding_system *coding;
6339      int *buf;
6340      EMACS_INT *stop;
6341 {
6342   Lisp_Object val, next;
6343   int id;
6344
6345   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6346   if (! NILP (val) && CHARSETP (val))
6347     id = XINT (CHARSET_SYMBOL_ID (val));
6348   else
6349     id = -1;
6350   ADD_CHARSET_DATA (buf, 0, id);
6351   next = Fnext_single_property_change (make_number (pos), Qcharset,
6352                                        coding->src_object,
6353                                        make_number (limit));
6354   *stop = XINT (next);
6355   return buf;
6356 }
6357
6358
/* Read characters from CODING->source into CODING->charbuf for the
   encoder.

   Reading resumes at byte offset CODING->consumed (character position
   CODING->src_pos + CODING->consumed_char).  It stops when the end of
   the source text is seen at a stop check, when the usable part of the
   character buffer is full, or when a translation cannot be completed.

   While copying, a '\r' is turned into '\n' when
   CODING_MODE_SELECTIVE_DISPLAY is set, '\n' is converted according to
   the eol type of CODING->id, and each character is looked up in
   TRANSLATION_TABLE.  MAX_LOOKUP is the maximum number of characters
   handed to get_translation at once; it sizes the lookup buffer.  When
   CODING->src_object is non-nil and CODING_ANNOTATE_CHARSET_MASK is
   set in CODING->common_flags, charset annotations are stored in the
   character buffer as well.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated and CODING->chars_at_source is
   cleared.  */
6359 static void
6360 consume_chars (coding, translation_table, max_lookup)
6361      struct coding_system *coding;
6362      Lisp_Object translation_table;
6363      int max_lookup;
6364 {
6365   int *buf = coding->charbuf;
6366   int *buf_end = coding->charbuf + coding->charbuf_size;
6367   const unsigned char *src = coding->source + coding->consumed;
6368   const unsigned char *src_end = coding->source + coding->src_bytes;
6369   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6370   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6371   int multibytep = coding->src_multibyte;
6372   Lisp_Object eol_type;
6373   int c;
6374   EMACS_INT stop, stop_composition, stop_charset;
6375   int *lookup_buf = NULL;
6376
       /* The lookup buffer is needed only when there is a translation
          table; it lives on the stack for the duration of this call.  */
6377   if (! NILP (translation_table))
6378     lookup_buf = alloca (sizeof (int) * max_lookup);
6379
       /* A vector-valued eol type is handled like Qunix, i.e. no eol
          conversion is done (presumably the vector means the eol format
          is not decided yet -- confirm).  */
6380   eol_type = CODING_ID_EOL_TYPE (coding->id);
6381   if (VECTORP (eol_type))
6382     eol_type = Qunix;
6383
6384   /* Note: composition handling is not yet implemented.  */
6385   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6386
       /* STOP is the next position at which an annotation handler has to
          run.  Without a source object there is nothing to annotate, so
          every stop is END_POS.  As the composition flag was cleared just
          above, the first test below never succeeds.  */
6387   if (NILP (coding->src_object))
6388     stop = stop_composition = stop_charset = end_pos;
6389   else
6390     {
6391       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6392         stop = stop_composition = pos;
6393       else
6394         stop = stop_composition = end_pos;
6395       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6396         stop = stop_charset = pos;
6397       else
6398         stop_charset = end_pos;
6399     }
6400
6401   /* Compensate for CRLF and conversion.  */
       /* That is, keep 1 + MAX_ANNOTATION_LENGTH elements in reserve so
          that one iteration can store an extra '\r' and an annotation
          without overrunning the buffer (NOTE(review): assumes one
          annotation never exceeds MAX_ANNOTATION_LENGTH -- confirm).  */
6402   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6403   while (buf < buf_end)
6404     {
6405       Lisp_Object trans;
6406
           /* At a stop position: finish if the source is exhausted,
              otherwise let the due handlers store their annotations and
              compute the next stop.  */
6407       if (pos == stop)
6408         {
6409           if (pos == end_pos)
6410             break;
6411           if (pos == stop_composition)
6412             buf = handle_composition_annotation (pos, end_pos, coding,
6413                                                  buf, &stop_composition);
6414           if (pos == stop_charset)
6415             buf = handle_charset_annotation (pos, end_pos, coding,
6416                                              buf, &stop_charset);
6417           stop = (stop_composition < stop_charset
6418                   ? stop_composition : stop_charset);
6419         }
6420
           /* Fetch the next character C and advance SRC and POS.  For a
              unibyte source: the raw-text encoder takes the byte as is;
              otherwise a valid multibyte sequence at SRC is read as one
              character (POS advancing by its byte length), and any other
              byte is converted with BYTE8_TO_CHAR.  */
6421       if (! multibytep)
6422         {
6423           EMACS_INT bytes;
6424
6425           if (coding->encoder == encode_coding_raw_text)
6426             c = *src++, pos++;
6427           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6428             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6429           else
6430             c = BYTE8_TO_CHAR (*src), src++, pos++;
6431         }
6432       else
6433         c = STRING_CHAR_ADVANCE (src), pos++;
           /* With selective display, a carriage return is treated as a
              newline.  */
6434       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6435         c = '\n';
           /* Eol conversion: for Qdos store '\r' before the '\n'; for any
              other non-unix eol type replace the '\n' by '\r'.  */
6436       if (! EQ (eol_type, Qunix))
6437         {
6438           if (c == '\n')
6439             {
6440               if (EQ (eol_type, Qdos))
6441                 *buf++ = '\r';
6442               else
6443                 c = '\r';
6444             }
6445         }
6446
           /* Translate C.  If the lookup leaves TRANS nil, C (possibly
              changed by the macro) is stored directly.  */
6447       trans = Qnil;
6448       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6449       if (NILP (trans))
6450         *buf++ = c;
6451       else
6452         {
6453           int from_nchars = 1, to_nchars = 1;
6454           int *lookup_buf_end;
6455           const unsigned char *p = src;
6456           int i;
6457
               /* Collect C and up to MAX_LOOKUP - 1 following characters
                  (peeked through P, so SRC is not advanced) and let
                  get_translation decide how many source characters map
                  to how many target characters.  */
6458           lookup_buf[0] = c;
6459           for (i = 1; i < max_lookup && p < src_end; i++)
6460             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6461           lookup_buf_end = lookup_buf + i;
6462           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6463                                    &from_nchars, &to_nchars);
               /* Give up for now if get_translation returned Qt or the
                  result does not fit.  NOTE(review): C has already been
                  consumed from SRC/POS here (and a '\r' may have been
                  stored for Qdos) -- confirm that the caller copes with
                  this.  */
6464           if (EQ (trans, Qt)
6465               || buf + to_nchars > buf_end)
6466             break;
               /* Store the first element of LOOKUP_BUF (presumably
                  rewritten by get_translation -- confirm) followed by the
                  remaining target characters taken from TRANS, then skip
                  the additional source characters that were translated.  */
6467           *buf++ = *lookup_buf;
6468           for (i = 1; i < to_nchars; i++)
6469             *buf++ = XINT (AREF (trans, i));
6470           for (i = 1; i < from_nchars; i++, pos++)
6471             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6472         }
6473     }
6474
       /* Record how far we got so that the next call resumes here.  */
6475   coding->consumed = src - coding->source;
6476   coding->consumed_char = pos - coding->src_pos;
6477   coding->charbuf_used = buf - coding->charbuf;
6478   coding->chars_at_source = 0;
6479 }
6480
6481
6482 /* Encode the text at CODING->src_object into CODING->dst_object.
6483    CODING->src_object is a buffer or a string.
6484    CODING->dst_object is a buffer or nil.
6485
6486    If CODING->src_object is a buffer, it must be the current buffer.
6487    In this case, if CODING->src_pos is positive, it is a position of
6488    the source text in the buffer, otherwise. the source text is in the
6489    gap area of the buffer, and coding->src_pos specifies the offset of
6490    the text from GPT (which must be the same as PT).  If this is the
6491    same buffer as CODING->dst_object, CODING->src_pos must be
6492    negative and CODING should not have `pre-write-conversion'.
6493
6494    If CODING->src_object is a string, CODING should not have
6495    `pre-write-conversion'.
6496
6497    If CODING->dst_object is a buffer, the encoded data is inserted at
6498    the current point of that buffer.
6499
6500    If CODING->dst_object is nil, the encoded data is placed at the
6501    memory area specified by CODING->destination.  */
6502
6503 static int
6504 encode_coding (coding)
6505      struct coding_system *coding;
6506 {
6507   Lisp_Object attrs;
6508   Lisp_Object translation_table;
6509   int max_lookup;
6510
6511   attrs = CODING_ID_ATTRS (coding->id);
6512   if (coding->encoder == encode_coding_raw_text)
6513     translation_table = Qnil, max_lookup = 0;
6514   else
6515     translation_table = get_translation_table (attrs, 1, &max_lookup);
6516
6517   if (BUFFERP (coding->dst_object))
6518     {
6519       set_buffer_internal (XBUFFER (coding->dst_object));
6520       coding->dst_multibyte
6521         = ! NILP (current_buffer->enable_multibyte_characters);
6522     }
6523
6524   coding->consumed = coding->consumed_char = 0;
6525   coding->produced = coding->produced_char = 0;
6526   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6527   coding->errors = 0;
6528
6529   ALLOC_CONVERSION_WORK_AREA (coding);
6530
6531   do {
6532     coding_set_source (coding);
6533     consume_chars (coding, translation_table, max_lookup);
6534     coding_set_destination (coding);
6535     (*(coding->encoder)) (coding);
6536   } while (coding->consumed_char < coding->src_chars);
6537
6538   if (BUFFERP (coding->dst_object))
6539     insert_from_gap (coding->produced_char, coding->produced);
6540
6541   return (coding->result);
6542 }
6543
6544
6545 /* Name (or base name) of work buffer for code conversion.  */
     /* make_conversion_work_buffer uses it as is for the reusable
        buffer and passes it to Fgenerate_new_buffer_name to name any
        additional work buffer.  */
6546 static Lisp_Object Vcode_conversion_workbuf_name;
6547
6548 /* A working buffer used by the top level conversion.  Once it is
6549    created, it is never destroyed.  It has the name
6550    Vcode_conversion_workbuf_name.  The other working buffers are
6551    destroyed after the use is finished, and their names are modified
6552    versions of Vcode_conversion_workbuf_name.  */
6553 static Lisp_Object Vcode_conversion_reused_workbuf;
6554
6555 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
     /* More precisely, it is nonzero while that buffer is in use:
        make_conversion_work_buffer increments it on every call, and
        code_conversion_restore resets it to 0 when the reusable buffer
        is released, so it can exceed 1 during nested conversions.  */
6556 static int reused_workbuf_in_use;
6557
6558
6559 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6560    multibyteness of returning buffer.  */
6561
6562 static Lisp_Object
6563 make_conversion_work_buffer (multibyte)
6564      int multibyte;
6565 {
6566   Lisp_Object name, workbuf;
6567   struct buffer *current;
6568
6569   if (reused_workbuf_in_use++)
6570     {
6571       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6572       workbuf = Fget_buffer_create (name);
6573     }
6574   else
6575     {
6576       name = Vcode_conversion_workbuf_name;
6577       workbuf = Fget_buffer_create (name);
6578       if (NILP (Vcode_conversion_reused_workbuf))
6579         Vcode_conversion_reused_workbuf = workbuf;
6580     }
6581   current = current_buffer;
6582   set_buffer_internal (XBUFFER (workbuf));
6583   Ferase_buffer ();
6584   current_buffer->undo_list = Qt;
6585   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6586   set_buffer_internal (current);
6587   return workbuf;
6588 }
6589
6590
6591 static Lisp_Object
6592 code_conversion_restore (arg)
6593      Lisp_Object arg;
6594 {
6595   Lisp_Object current, workbuf;
6596
6597   current = XCAR (arg);
6598   workbuf = XCDR (arg);
6599   if (! NILP (workbuf))
6600     {
6601       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6602         reused_workbuf_in_use = 0;
6603       else if (! NILP (Fbuffer_live_p (workbuf)))
6604         Fkill_buffer (workbuf);
6605     }
6606   set_buffer_internal (XBUFFER (current));
6607   return Qnil;
6608 }
6609
6610 Lisp_Object
6611 code_conversion_save (with_work_buf, multibyte)
6612      int with_work_buf, multibyte;
6613 {
6614   Lisp_Object workbuf = Qnil;
6615
6616   if (with_work_buf)
6617     workbuf = make_conversion_work_buffer (multibyte);
6618   record_unwind_protect (code_conversion_restore,
6619                          Fcons (Fcurrent_buffer (), workbuf));
6620   return workbuf;
6621 }
6622
6623 int
6624 decode_coding_gap (coding, chars, bytes)
6625      struct coding_system *coding;
6626      EMACS_INT chars, bytes;
6627 {
6628   int count = specpdl_ptr - specpdl;
6629   Lisp_Object attrs;
6630
6631   code_conversion_save (0, 0);
6632
6633   coding->src_object = Fcurrent_buffer ();
6634   coding->src_chars = chars;
6635   coding->src_bytes = bytes;
6636   coding->src_pos = -chars;
6637   coding->src_pos_byte = -bytes;
6638   coding->src_multibyte = chars < bytes;
6639   coding->dst_object = coding->src_object;
6640   coding->dst_pos = PT;
6641   coding->dst_pos_byte = PT_BYTE;
6642   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6643   coding->mode |= CODING_MODE_LAST_BLOCK;
6644
6645   if (CODING_REQUIRE_DETECTION (coding))
6646     detect_coding (coding);
6647
6648   decode_coding (coding);
6649
6650   attrs = CODING_ID_ATTRS (coding->id);
6651   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6652     {
6653       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6654       Lisp_Object val;
6655
6656       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6657       val = call1 (CODING_ATTR_POST_READ (attrs),
6658                    make_number (coding->produced_char));
6659       CHECK_NATNUM (val);
6660       coding->produced_char += Z - prev_Z;
6661       coding->produced += Z_BYTE - prev_Z_BYTE;
6662     }
6663
6664   unbind_to (count, Qnil);
6665   return coding->result;
6666 }
6667
6668 int
6669 encode_coding_gap (coding, chars, bytes)
6670      struct coding_system *coding;
6671      EMACS_INT chars, bytes;
6672 {
6673   int count = specpdl_ptr - specpdl;
6674
6675   code_conversion_save (0, 0);
6676
6677   coding->src_object = Fcurrent_buffer ();
6678   coding->src_chars = chars;
6679   coding->src_bytes = bytes;
6680   coding->src_pos = -chars;
6681   coding->src_pos_byte = -bytes;
6682   coding->src_multibyte = chars < bytes;
6683   coding->dst_object = coding->src_object;
6684   coding->dst_pos = PT;
6685   coding->dst_pos_byte = PT_BYTE;
6686
6687   encode_coding (coding);
6688
6689   unbind_to (count, Qnil);
6690   return coding->result;
6691 }
6692
6693
6694 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6695    SRC_OBJECT into DST_OBJECT by coding context CODING.
6696
6697    SRC_OBJECT is a buffer, a string, or Qnil.
6698
6699    If it is a buffer, the text is at point of the buffer.  FROM and TO
6700    are positions in the buffer.
6701
6702    If it is a string, the text is at the beginning of the string.
6703    FROM and TO are indices to the string.
6704
6705    If it is nil, the text is at coding->source.  FROM and TO are
6706    indices to coding->source.
6707
6708    DST_OBJECT is a buffer, Qt, or Qnil.
6709
6710    If it is a buffer, the decoded text is inserted at point of the
6711    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6712    is deleted.
6713
6714    If it is Qt, a string is made from the decoded text, and
6715    set in CODING->dst_object.
6716
6717    If it is Qnil, the decoded text is stored at CODING->destination.
6718    The caller must allocate CODING->dst_bytes bytes at
6719    CODING->destination by xmalloc.  If the decoded text is longer than
6720    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6721  */
6722
6723 void
6724 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6725                       dst_object)
6726      struct coding_system *coding;
6727      Lisp_Object src_object;
6728      EMACS_INT from, from_byte, to, to_byte;
6729      Lisp_Object dst_object;
6730 {
6731   int count = specpdl_ptr - specpdl;
6732   unsigned char *destination;
6733   EMACS_INT dst_bytes;
6734   EMACS_INT chars = to - from;
6735   EMACS_INT bytes = to_byte - from_byte;
6736   Lisp_Object attrs;
6737   Lisp_Object buffer;
6738   int saved_pt = -1, saved_pt_byte;
6739
6740   buffer = Fcurrent_buffer ();
6741
6742   if (NILP (dst_object))
6743     {
6744       destination = coding->destination;
6745       dst_bytes = coding->dst_bytes;
6746     }
6747
6748   coding->src_object = src_object;
6749   coding->src_chars = chars;
6750   coding->src_bytes = bytes;
6751   coding->src_multibyte = chars < bytes;
6752
6753   if (STRINGP (src_object))
6754     {
6755       coding->src_pos = from;
6756       coding->src_pos_byte = from_byte;
6757     }
6758   else if (BUFFERP (src_object))
6759     {
6760       set_buffer_internal (XBUFFER (src_object));
6761       if (from != GPT)
6762         move_gap_both (from, from_byte);
6763       if (EQ (src_object, dst_object))
6764         {
6765           saved_pt = PT, saved_pt_byte = PT_BYTE;
6766           TEMP_SET_PT_BOTH (from, from_byte);
6767           del_range_both (from, from_byte, to, to_byte, 1);
6768           coding->src_pos = -chars;
6769           coding->src_pos_byte = -bytes;
6770         }
6771       else
6772         {
6773           coding->src_pos = from;
6774           coding->src_pos_byte = from_byte;
6775         }
6776     }
6777
6778   if (CODING_REQUIRE_DETECTION (coding))
6779     detect_coding (coding);
6780   attrs = CODING_ID_ATTRS (coding->id);
6781
6782   if (EQ (dst_object, Qt)
6783       || (! NILP (CODING_ATTR_POST_READ (attrs))
6784           && NILP (dst_object)))
6785     {
6786       coding->dst_object = code_conversion_save (1, 1);
6787       coding->dst_pos = BEG;
6788       coding->dst_pos_byte = BEG_BYTE;
6789       coding->dst_multibyte = 1;
6790     }
6791   else if (BUFFERP (dst_object))
6792     {
6793       code_conversion_save (0, 0);
6794       coding->dst_object = dst_object;
6795       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6796       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6797       coding->dst_multibyte
6798         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6799     }
6800   else
6801     {
6802       code_conversion_save (0, 0);
6803       coding->dst_object = Qnil;
6804       coding->dst_multibyte = 1;
6805     }
6806
6807   decode_coding (coding);
6808
6809   if (BUFFERP (coding->dst_object))
6810     set_buffer_internal (XBUFFER (coding->dst_object));
6811
6812   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6813     {
6814       struct gcpro gcpro1, gcpro2;
6815       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6816       Lisp_Object val;
6817
6818       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6819       GCPRO2 (coding->src_object, coding->dst_object);
6820       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6821                         make_number (coding->produced_char));
6822       UNGCPRO;
6823       CHECK_NATNUM (val);
6824       coding->produced_char += Z - prev_Z;
6825       coding->produced += Z_BYTE - prev_Z_BYTE;
6826     }
6827
6828   if (EQ (dst_object, Qt))
6829     {
6830       coding->dst_object = Fbuffer_string ();
6831     }
6832   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6833     {
6834       set_buffer_internal (XBUFFER (coding->dst_object));
6835       if (dst_bytes < coding->produced)
6836         {
6837           destination
6838             = (unsigned char *) xrealloc (destination, coding->produced);
6839           if (! destination)
6840             {
6841               record_conversion_result (coding,
6842                                         CODING_RESULT_INSUFFICIENT_DST);
6843               unbind_to (count, Qnil);
6844               return;
6845             }
6846           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6847             move_gap_both (BEGV, BEGV_BYTE);
6848           bcopy (BEGV_ADDR, destination, coding->produced);
6849           coding->destination = destination;
6850         }
6851     }
6852
6853   if (saved_pt >= 0)
6854     {
6855       /* This is the case of:
6856          (BUFFERP (src_object) && EQ (src_object, dst_object))
6857          As we have moved PT while replacing the original buffer
6858          contents, we must recover it now.  */
6859       set_buffer_internal (XBUFFER (src_object));
6860       if (saved_pt < from)
6861         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6862       else if (saved_pt < from + chars)
6863         TEMP_SET_PT_BOTH (from, from_byte);
6864       else if (! NILP (current_buffer->enable_multibyte_characters))
6865         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6866                           saved_pt_byte + (coding->produced - bytes));
6867       else
6868         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6869                           saved_pt_byte + (coding->produced - bytes));
6870     }
6871
6872   unbind_to (count, coding->dst_object);
6873 }
6874
6875
6876 void
6877 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6878                       dst_object)
6879      struct coding_system *coding;
6880      Lisp_Object src_object;
6881      EMACS_INT from, from_byte, to, to_byte;
6882      Lisp_Object dst_object;
6883 {
6884   int count = specpdl_ptr - specpdl;
6885   EMACS_INT chars = to - from;
6886   EMACS_INT bytes = to_byte - from_byte;
6887   Lisp_Object attrs;
6888   Lisp_Object buffer;
6889   int saved_pt = -1, saved_pt_byte;
6890
6891   buffer = Fcurrent_buffer ();
6892
6893   coding->src_object = src_object;
6894   coding->src_chars = chars;
6895   coding->src_bytes = bytes;
6896   coding->src_multibyte = chars < bytes;
6897
6898   attrs = CODING_ID_ATTRS (coding->id);
6899
6900   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6901     {
6902       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6903       set_buffer_internal (XBUFFER (coding->src_object));
6904       if (STRINGP (src_object))
6905         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6906       else if (BUFFERP (src_object))
6907         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6908       else
6909         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6910
6911       if (EQ (src_object, dst_object))
6912         {
6913           set_buffer_internal (XBUFFER (src_object));
6914           saved_pt = PT, saved_pt_byte = PT_BYTE;
6915           del_range_both (from, from_byte, to, to_byte, 1);
6916           set_buffer_internal (XBUFFER (coding->src_object));
6917         }
6918
6919       {
6920         Lisp_Object args[3];
6921
6922         args[0] = CODING_ATTR_PRE_WRITE (attrs);
6923         args[1] = make_number (BEG);
6924         args[2] = make_number (Z);
6925         safe_call (3, args);
6926       }
6927       coding->src_object = Fcurrent_buffer ();
6928       if (BEG != GPT)
6929         move_gap_both (BEG, BEG_BYTE);
6930       coding->src_chars = Z - BEG;
6931       coding->src_bytes = Z_BYTE - BEG_BYTE;
6932       coding->src_pos = BEG;
6933       coding->src_pos_byte = BEG_BYTE;
6934       coding->src_multibyte = Z < Z_BYTE;
6935     }
6936   else if (STRINGP (src_object))
6937     {
6938       code_conversion_save (0, 0);
6939       coding->src_pos = from;
6940       coding->src_pos_byte = from_byte;
6941     }
6942   else if (BUFFERP (src_object))
6943     {
6944       code_conversion_save (0, 0);
6945       set_buffer_internal (XBUFFER (src_object));
6946       if (EQ (src_object, dst_object))
6947         {
6948           saved_pt = PT, saved_pt_byte = PT_BYTE;
6949           coding->src_object = del_range_1 (from, to, 1, 1);
6950           coding->src_pos = 0;
6951           coding->src_pos_byte = 0;
6952         }
6953       else
6954         {
6955           if (from < GPT && to >= GPT)
6956             move_gap_both (from, from_byte);
6957           coding->src_pos = from;
6958           coding->src_pos_byte = from_byte;
6959         }
6960     }
6961   else
6962     code_conversion_save (0, 0);
6963
6964   if (BUFFERP (dst_object))
6965     {
6966       coding->dst_object = dst_object;
6967       if (EQ (src_object, dst_object))
6968         {
6969           coding->dst_pos = from;
6970           coding->dst_pos_byte = from_byte;
6971         }
6972       else
6973         {
6974           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6975           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6976         }
6977       coding->dst_multibyte
6978         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6979     }
6980   else if (EQ (dst_object, Qt))
6981     {
6982       coding->dst_object = Qnil;
6983       coding->dst_bytes = coding->src_chars;
6984       if (coding->dst_bytes == 0)
6985         coding->dst_bytes = 1;
6986       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6987       coding->dst_multibyte = 0;
6988     }
6989   else
6990     {
6991       coding->dst_object = Qnil;
6992       coding->dst_multibyte = 0;
6993     }
6994
6995   encode_coding (coding);
6996
6997   if (EQ (dst_object, Qt))
6998     {
6999       if (BUFFERP (coding->dst_object))
7000         coding->dst_object = Fbuffer_string ();
7001       else
7002         {
7003           coding->dst_object
7004             = make_unibyte_string ((char *) coding->destination,
7005                                    coding->produced);
7006           xfree (coding->destination);
7007         }
7008     }
7009
7010   if (saved_pt >= 0)
7011     {
7012       /* This is the case of:
7013          (BUFFERP (src_object) && EQ (src_object, dst_object))
7014          As we have moved PT while replacing the original buffer
7015          contents, we must recover it now.  */
7016       set_buffer_internal (XBUFFER (src_object));
7017       if (saved_pt < from)
7018         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7019       else if (saved_pt < from + chars)
7020         TEMP_SET_PT_BOTH (from, from_byte);
7021       else if (! NILP (current_buffer->enable_multibyte_characters))
7022         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7023                           saved_pt_byte + (coding->produced - bytes));
7024       else
7025         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7026                           saved_pt_byte + (coding->produced - bytes));
7027     }
7028
7029   unbind_to (count, Qnil);
7030 }
7031
7032
7033 Lisp_Object
7034 preferred_coding_system ()
7035 {
7036   int id = coding_categories[coding_priorities[0]].id;
7037
7038   return CODING_ID_NAME (id);
7039 }
7040
7041 \f
7042 #ifdef emacs
7043 /*** 11. Emacs Lisp library functions ***/
7044
7045 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7046        doc: /* Return t if OBJECT is nil or a coding-system.
7047 See the documentation of `define-coding-system' for information
7048 about coding-system objects.  */)
7049      (obj)
7050      Lisp_Object obj;
7051 {
7052   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
7053 }
7054
7055 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7056        Sread_non_nil_coding_system, 1, 1, 0,
7057        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7058      (prompt)
7059      Lisp_Object prompt;
7060 {
7061   Lisp_Object val;
7062   do
7063     {
7064       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7065                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7066     }
7067   while (SCHARS (val) == 0);
7068   return (Fintern (val, Qnil));
7069 }
7070
7071 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7072        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7073 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7074      (prompt, default_coding_system)
7075      Lisp_Object prompt, default_coding_system;
7076 {
7077   Lisp_Object val;
7078   if (SYMBOLP (default_coding_system))
7079     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7080   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7081                           Qt, Qnil, Qcoding_system_history,
7082                           default_coding_system, Qnil);
7083   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7084 }
7085
7086 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7087        1, 1, 0,
7088        doc: /* Check validity of CODING-SYSTEM.
7089 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7090 It is valid if it is nil or a symbol defined as a coding system by the
7091 function `define-coding-system'.  */)
7092   (coding_system)
7093      Lisp_Object coding_system;
7094 {
7095   CHECK_SYMBOL (coding_system);
7096   if (!NILP (Fcoding_system_p (coding_system)))
7097     return coding_system;
7098   while (1)
7099     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7100 }
7101
7102 \f
7103 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7104    HIGHEST is nonzero, return the coding system of the highest
7105    priority among the detected coding systems.  Otherwise return a
7106    list of detected coding systems sorted by their priorities.  If
7107    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7108    multibyte form but contain only ASCII and eight-bit chars.
7109    Otherwise, the bytes are raw bytes.
7110
7111    CODING-SYSTEM controls the detection as below:
7112
7113    If it is nil, detect both text-format and eol-format.  If the
7114    text-format part of CODING-SYSTEM is already specified
7115    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7116    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7117    detect only text-format.  */
7118
7119 Lisp_Object
7120 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7121                       coding_system)
7122      const unsigned char *src;
7123      int src_chars, src_bytes, highest;
7124      int multibytep;
7125      Lisp_Object coding_system;
7126 {
7127   const unsigned char *src_end = src + src_bytes;
7128   Lisp_Object attrs, eol_type;
7129   Lisp_Object val;
7130   struct coding_system coding;
7131   int id;
7132   struct coding_detection_info detect_info;
7133   enum coding_category base_category;
7134
7135   if (NILP (coding_system))
7136     coding_system = Qundecided;
7137   setup_coding_system (coding_system, &coding);
7138   attrs = CODING_ID_ATTRS (coding.id);
7139   eol_type = CODING_ID_EOL_TYPE (coding.id);
7140   coding_system = CODING_ATTR_BASE_NAME (attrs);
7141
7142   coding.source = src;
7143   coding.src_chars = src_chars;
7144   coding.src_bytes = src_bytes;
7145   coding.src_multibyte = multibytep;
7146   coding.consumed = 0;
7147   coding.mode |= CODING_MODE_LAST_BLOCK;
7148
7149   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7150
7151   /* At first, detect text-format if necessary.  */
7152   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7153   if (base_category == coding_category_undecided)
7154     {
7155       enum coding_category category;
7156       struct coding_system *this;
7157       int c, i;
7158
7159       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7160       for (i = 0; src < src_end; i++, src++)
7161         {
7162           c = *src;
7163           if (c & 0x80)
7164             break;
7165           if (c < 0x20
7166               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7167               && inhibit_iso_escape_detection)
7168             {
7169               coding.head_ascii = src - coding.source;
7170               if (detect_coding_iso_2022 (&coding, &detect_info))
7171                 {
7172                   /* We have scanned the whole data.  */
7173                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7174                     /* We didn't find an 8-bit code.  */
7175                     src = src_end;
7176                   break;
7177                 }
7178             }
7179         }
7180       coding.head_ascii = src - coding.source;
7181
7182       if (src < src_end
7183           || detect_info.found)
7184         {
7185           if (src == src_end)
7186             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7187             for (i = 0; i < coding_category_raw_text; i++)
7188               {
7189                 category = coding_priorities[i];
7190                 if (detect_info.found & (1 << category))
7191                   break;
7192               }
7193           else
7194             for (i = 0; i < coding_category_raw_text; i++)
7195               {
7196                 category = coding_priorities[i];
7197                 this = coding_categories + category;
7198
7199                 if (this->id < 0)
7200                   {
7201                     /* No coding system of this category is defined.  */
7202                     detect_info.rejected |= (1 << category);
7203                   }
7204                 else if (category >= coding_category_raw_text)
7205                   continue;
7206                 else if (detect_info.checked & (1 << category))
7207                   {
7208                     if (highest
7209                         && (detect_info.found & (1 << category)))
7210                       break;
7211                   }
7212                 else
7213                   {
7214                     if ((*(this->detector)) (&coding, &detect_info)
7215                         && highest
7216                         && (detect_info.found & (1 << category)))
7217                       {
7218                         if (category == coding_category_utf_16_auto)
7219                           {
7220                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7221                               category = coding_category_utf_16_le;
7222                             else
7223                               category = coding_category_utf_16_be;
7224                           }
7225                         break;
7226                       }
7227                   }
7228               }
7229         }
7230
7231       if (detect_info.rejected == CATEGORY_MASK_ANY)
7232         {
7233           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7234           id = coding_categories[coding_category_raw_text].id;
7235           val = Fcons (make_number (id), Qnil);
7236         }
7237       else if (! detect_info.rejected && ! detect_info.found)
7238         {
7239           detect_info.found = CATEGORY_MASK_ANY;
7240           id = coding_categories[coding_category_undecided].id;
7241           val = Fcons (make_number (id), Qnil);
7242         }
7243       else if (highest)
7244         {
7245           if (detect_info.found)
7246             {
7247               detect_info.found = 1 << category;
7248               val = Fcons (make_number (this->id), Qnil);
7249             }
7250           else
7251             for (i = 0; i < coding_category_raw_text; i++)
7252               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7253                 {
7254                   detect_info.found = 1 << coding_priorities[i];
7255                   id = coding_categories[coding_priorities[i]].id;
7256                   val = Fcons (make_number (id), Qnil);
7257                   break;
7258                 }
7259         }
7260       else
7261         {
7262           int mask = detect_info.rejected | detect_info.found;
7263           int found = 0;
7264           val = Qnil;
7265
7266           for (i = coding_category_raw_text - 1; i >= 0; i--)
7267             {
7268               category = coding_priorities[i];
7269               if (! (mask & (1 << category)))
7270                 {
7271                   found |= 1 << category;
7272                   id = coding_categories[category].id;
7273                   val = Fcons (make_number (id), val);
7274                 }
7275             }
7276           for (i = coding_category_raw_text - 1; i >= 0; i--)
7277             {
7278               category = coding_priorities[i];
7279               if (detect_info.found & (1 << category))
7280                 {
7281                   id = coding_categories[category].id;
7282                   val = Fcons (make_number (id), val);
7283                 }
7284             }
7285           detect_info.found |= found;
7286         }
7287     }
7288   else if (base_category == coding_category_utf_16_auto)
7289     {
7290       if (detect_coding_utf_16 (&coding, &detect_info))
7291         {
7292           struct coding_system *this;
7293
7294           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7295             this = coding_categories + coding_category_utf_16_le;
7296           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7297             this = coding_categories + coding_category_utf_16_be;
7298           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7299             this = coding_categories + coding_category_utf_16_be_nosig;
7300           else
7301             this = coding_categories + coding_category_utf_16_le_nosig;
7302           val = Fcons (make_number (this->id), Qnil);
7303         }
7304     }
7305   else
7306     {
7307       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7308       val = Fcons (make_number (coding.id), Qnil);
7309     }
7310
7311   /* Then, detect eol-format if necessary.  */
7312   {
7313     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7314     Lisp_Object tail;
7315
7316     if (VECTORP (eol_type))
7317       {
7318         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7319           normal_eol = detect_eol (coding.source, src_bytes,
7320                                    coding_category_raw_text);
7321         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7322                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7323           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7324                                       coding_category_utf_16_be);
7325         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7326                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7327           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7328                                       coding_category_utf_16_le);
7329       }
7330     else
7331       {
7332         if (EQ (eol_type, Qunix))
7333           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7334         else if (EQ (eol_type, Qdos))
7335           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7336         else
7337           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7338       }
7339
7340     for (tail = val; CONSP (tail); tail = XCDR (tail))
7341       {
7342         enum coding_category category;
7343         int this_eol;
7344
7345         id = XINT (XCAR (tail));
7346         attrs = CODING_ID_ATTRS (id);
7347         category = XINT (CODING_ATTR_CATEGORY (attrs));
7348         eol_type = CODING_ID_EOL_TYPE (id);
7349         if (VECTORP (eol_type))
7350           {
7351             if (category == coding_category_utf_16_be
7352                 || category == coding_category_utf_16_be_nosig)
7353               this_eol = utf_16_be_eol;
7354             else if (category == coding_category_utf_16_le
7355                      || category == coding_category_utf_16_le_nosig)
7356               this_eol = utf_16_le_eol;
7357             else
7358               this_eol = normal_eol;
7359
7360             if (this_eol == EOL_SEEN_LF)
7361               XSETCAR (tail, AREF (eol_type, 0));
7362             else if (this_eol == EOL_SEEN_CRLF)
7363               XSETCAR (tail, AREF (eol_type, 1));
7364             else if (this_eol == EOL_SEEN_CR)
7365               XSETCAR (tail, AREF (eol_type, 2));
7366             else
7367               XSETCAR (tail, CODING_ID_NAME (id));
7368           }
7369         else
7370           XSETCAR (tail, CODING_ID_NAME (id));
7371       }
7372   }
7373
7374   return (highest ? XCAR (val) : val);
7375 }
7376
7377
7378 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7379        2, 3, 0,
7380        doc: /* Detect coding system of the text in the region between START and END.
7381 Return a list of possible coding systems ordered by priority.
7382
7383 If only ASCII characters are found, it returns a list of single element
7384 `undecided' or its subsidiary coding system according to a detected
7385 end-of-line format.
7386
7387 If optional argument HIGHEST is non-nil, return the coding system of
7388 highest priority.  */)
7389      (start, end, highest)
7390      Lisp_Object start, end, highest;
7391 {
7392   int from, to;
7393   int from_byte, to_byte;
7394
7395   CHECK_NUMBER_COERCE_MARKER (start);
7396   CHECK_NUMBER_COERCE_MARKER (end);
7397
7398   validate_region (&start, &end);
7399   from = XINT (start), to = XINT (end);
7400   from_byte = CHAR_TO_BYTE (from);
7401   to_byte = CHAR_TO_BYTE (to);
7402
7403   if (from < GPT && to >= GPT)
7404     move_gap_both (to, to_byte);
7405
7406   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7407                                to - from, to_byte - from_byte,
7408                                !NILP (highest),
7409                                !NILP (current_buffer
7410                                       ->enable_multibyte_characters),
7411                                Qnil);
7412 }
7413
7414 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7415        1, 2, 0,
7416        doc: /* Detect coding system of the text in STRING.
7417 Return a list of possible coding systems ordered by priority.
7418
7419 If only ASCII characters are found, it returns a list of single element
7420 `undecided' or its subsidiary coding system according to a detected
7421 end-of-line format.
7422
7423 If optional argument HIGHEST is non-nil, return the coding system of
7424 highest priority.  */)
7425      (string, highest)
7426      Lisp_Object string, highest;
7427 {
7428   CHECK_STRING (string);
7429
7430   return detect_coding_system (SDATA (string),
7431                                SCHARS (string), SBYTES (string),
7432                                !NILP (highest), STRING_MULTIBYTE (string),
7433                                Qnil);
7434 }
7435
7436
7437 static INLINE int
7438 char_encodable_p (c, attrs)
7439      int c;
7440      Lisp_Object attrs;
7441 {
7442   Lisp_Object tail;
7443   struct charset *charset;
7444   Lisp_Object translation_table;
7445
7446   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7447   if (! NILP (translation_table))
7448     c = translate_char (translation_table, c);
7449   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7450        CONSP (tail); tail = XCDR (tail))
7451     {
7452       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7453       if (CHAR_CHARSET_P (c, charset))
7454         break;
7455     }
7456   return (! NILP (tail));
7457 }
7458
7459
7460 /* Return a list of coding systems that safely encode the text between
7461    START and END.  If EXCLUDE is non-nil, it is a list of coding
7462    systems not to check.  The returned list doesn't contain any such
7463    coding systems.  In any case, if the text contains only ASCII or is
7464    unibyte, return t.  */
7465
7466 DEFUN ("find-coding-systems-region-internal",
7467        Ffind_coding_systems_region_internal,
7468        Sfind_coding_systems_region_internal, 2, 3, 0,
7469        doc: /* Internal use only.  */)
7470      (start, end, exclude)
7471      Lisp_Object start, end, exclude;
7472 {
7473   Lisp_Object coding_attrs_list, safe_codings;
7474   EMACS_INT start_byte, end_byte;
7475   const unsigned char *p, *pbeg, *pend;
7476   int c;
7477   Lisp_Object tail, elt;
7478
7479   if (STRINGP (start))
7480     {
7481       if (!STRING_MULTIBYTE (start)
7482           || SCHARS (start) == SBYTES (start))
7483         return Qt;
7484       start_byte = 0;
7485       end_byte = SBYTES (start);
7486     }
7487   else
7488     {
7489       CHECK_NUMBER_COERCE_MARKER (start);
7490       CHECK_NUMBER_COERCE_MARKER (end);
7491       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7492         args_out_of_range (start, end);
7493       if (NILP (current_buffer->enable_multibyte_characters))
7494         return Qt;
7495       start_byte = CHAR_TO_BYTE (XINT (start));
7496       end_byte = CHAR_TO_BYTE (XINT (end));
7497       if (XINT (end) - XINT (start) == end_byte - start_byte)
7498         return Qt;
7499
7500       if (XINT (start) < GPT && XINT (end) > GPT)
7501         {
7502           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7503             move_gap_both (XINT (start), start_byte);
7504           else
7505             move_gap_both (XINT (end), end_byte);
7506         }
7507     }
7508
7509   coding_attrs_list = Qnil;
7510   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7511     if (NILP (exclude)
7512         || NILP (Fmemq (XCAR (tail), exclude)))
7513       {
7514         Lisp_Object attrs;
7515
7516         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7517         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7518             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7519           {
7520             ASET (attrs, coding_attr_trans_tbl,
7521                   get_translation_table (attrs, 1, NULL));
7522             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7523           }
7524       }
7525
7526   if (STRINGP (start))
7527     p = pbeg = SDATA (start);
7528   else
7529     p = pbeg = BYTE_POS_ADDR (start_byte);
7530   pend = p + (end_byte - start_byte);
7531
7532   while (p < pend && ASCII_BYTE_P (*p)) p++;
7533   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7534
7535   while (p < pend)
7536     {
7537       if (ASCII_BYTE_P (*p))
7538         p++;
7539       else
7540         {
7541           c = STRING_CHAR_ADVANCE (p);
7542
7543           charset_map_loaded = 0;
7544           for (tail = coding_attrs_list; CONSP (tail);)
7545             {
7546               elt = XCAR (tail);
7547               if (NILP (elt))
7548                 tail = XCDR (tail);
7549               else if (char_encodable_p (c, elt))
7550                 tail = XCDR (tail);
7551               else if (CONSP (XCDR (tail)))
7552                 {
7553                   XSETCAR (tail, XCAR (XCDR (tail)));
7554                   XSETCDR (tail, XCDR (XCDR (tail)));
7555                 }
7556               else
7557                 {
7558                   XSETCAR (tail, Qnil);
7559                   tail = XCDR (tail);
7560                 }
7561             }
7562           if (charset_map_loaded)
7563             {
7564               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7565
7566               if (STRINGP (start))
7567                 pbeg = SDATA (start);
7568               else
7569                 pbeg = BYTE_POS_ADDR (start_byte);
7570               p = pbeg + p_offset;
7571               pend = pbeg + pend_offset;
7572             }
7573         }
7574     }
7575
7576   safe_codings = list2 (Qraw_text, Qno_conversion);
7577   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7578     if (! NILP (XCAR (tail)))
7579       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7580
7581   return safe_codings;
7582 }
7583
7584
7585 DEFUN ("unencodable-char-position", Funencodable_char_position,
7586        Sunencodable_char_position, 3, 5, 0,
7587        doc: /*
7588 Return position of first un-encodable character in a region.
7589 START and END specfiy the region and CODING-SYSTEM specifies the
7590 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7591
7592 If optional 4th argument COUNT is non-nil, it specifies at most how
7593 many un-encodable characters to search.  In this case, the value is a
7594 list of positions.
7595
7596 If optional 5th argument STRING is non-nil, it is a string to search
7597 for un-encodable characters.  In that case, START and END are indexes
7598 to the string.  */)
7599      (start, end, coding_system, count, string)
7600      Lisp_Object start, end, coding_system, count, string;
7601 {
7602   int n;
7603   struct coding_system coding;
7604   Lisp_Object attrs, charset_list, translation_table;
7605   Lisp_Object positions;
7606   int from, to;
7607   const unsigned char *p, *stop, *pend;
7608   int ascii_compatible;
7609
7610   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7611   attrs = CODING_ID_ATTRS (coding.id);
7612   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7613     return Qnil;
7614   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7615   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7616   translation_table = get_translation_table (attrs, 1, NULL);
7617
7618   if (NILP (string))
7619     {
7620       validate_region (&start, &end);
7621       from = XINT (start);
7622       to = XINT (end);
7623       if (NILP (current_buffer->enable_multibyte_characters)
7624           || (ascii_compatible
7625               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7626         return Qnil;
7627       p = CHAR_POS_ADDR (from);
7628       pend = CHAR_POS_ADDR (to);
7629       if (from < GPT && to >= GPT)
7630         stop = GPT_ADDR;
7631       else
7632         stop = pend;
7633     }
7634   else
7635     {
7636       CHECK_STRING (string);
7637       CHECK_NATNUM (start);
7638       CHECK_NATNUM (end);
7639       from = XINT (start);
7640       to = XINT (end);
7641       if (from > to
7642           || to > SCHARS (string))
7643         args_out_of_range_3 (string, start, end);
7644       if (! STRING_MULTIBYTE (string))
7645         return Qnil;
7646       p = SDATA (string) + string_char_to_byte (string, from);
7647       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7648       if (ascii_compatible && (to - from) == (pend - p))
7649         return Qnil;
7650     }
7651
7652   if (NILP (count))
7653     n = 1;
7654   else
7655     {
7656       CHECK_NATNUM (count);
7657       n = XINT (count);
7658     }
7659
7660   positions = Qnil;
7661   while (1)
7662     {
7663       int c;
7664
7665       if (ascii_compatible)
7666         while (p < stop && ASCII_BYTE_P (*p))
7667           p++, from++;
7668       if (p >= stop)
7669         {
7670           if (p >= pend)
7671             break;
7672           stop = pend;
7673           p = GAP_END_ADDR;
7674         }
7675
7676       c = STRING_CHAR_ADVANCE (p);
7677       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7678           && ! char_charset (translate_char (translation_table, c),
7679                              charset_list, NULL))
7680         {
7681           positions = Fcons (make_number (from), positions);
7682           n--;
7683           if (n == 0)
7684             break;
7685         }
7686
7687       from++;
7688     }
7689
7690   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7691 }
7692
7693
7694 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7695        Scheck_coding_systems_region, 3, 3, 0,
7696        doc: /* Check if the region is encodable by coding systems.
7697
7698 START and END are buffer positions specifying the region.
7699 CODING-SYSTEM-LIST is a list of coding systems to check.
7700
7701 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7702 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7703 whole region, POS0, POS1, ... are buffer positions where non-encodable
7704 characters are found.
7705
7706 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7707 value is nil.
7708
7709 START may be a string.  In that case, check if the string is
7710 encodable, and the value contains indices to the string instead of
7711 buffer positions.  END is ignored.  */)
7712      (start, end, coding_system_list)
7713      Lisp_Object start, end, coding_system_list;
7714 {
7715   Lisp_Object list;
7716   EMACS_INT start_byte, end_byte;
7717   int pos;
7718   const unsigned char *p, *pbeg, *pend;
7719   int c;
7720   Lisp_Object tail, elt, attrs;
7721
7722   if (STRINGP (start))
7723     {
7724       if (!STRING_MULTIBYTE (start)
7725           && SCHARS (start) != SBYTES (start))
7726         return Qnil;
7727       start_byte = 0;
7728       end_byte = SBYTES (start);
7729       pos = 0;
7730     }
7731   else
7732     {
7733       CHECK_NUMBER_COERCE_MARKER (start);
7734       CHECK_NUMBER_COERCE_MARKER (end);
7735       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7736         args_out_of_range (start, end);
7737       if (NILP (current_buffer->enable_multibyte_characters))
7738         return Qnil;
7739       start_byte = CHAR_TO_BYTE (XINT (start));
7740       end_byte = CHAR_TO_BYTE (XINT (end));
7741       if (XINT (end) - XINT (start) == end_byte - start_byte)
7742         return Qt;
7743
7744       if (XINT (start) < GPT && XINT (end) > GPT)
7745         {
7746           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7747             move_gap_both (XINT (start), start_byte);
7748           else
7749             move_gap_both (XINT (end), end_byte);
7750         }
7751       pos = XINT (start);
7752     }
7753
7754   list = Qnil;
7755   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7756     {
7757       elt = XCAR (tail);
7758       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7759       ASET (attrs, coding_attr_trans_tbl,
7760             get_translation_table (attrs, 1, NULL));
7761       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7762     }
7763
7764   if (STRINGP (start))
7765     p = pbeg = SDATA (start);
7766   else
7767     p = pbeg = BYTE_POS_ADDR (start_byte);
7768   pend = p + (end_byte - start_byte);
7769
7770   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7771   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7772
7773   while (p < pend)
7774     {
7775       if (ASCII_BYTE_P (*p))
7776         p++;
7777       else
7778         {
7779           c = STRING_CHAR_ADVANCE (p);
7780
7781           charset_map_loaded = 0;
7782           for (tail = list; CONSP (tail); tail = XCDR (tail))
7783             {
7784               elt = XCDR (XCAR (tail));
7785               if (! char_encodable_p (c, XCAR (elt)))
7786                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7787             }
7788           if (charset_map_loaded)
7789             {
7790               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7791
7792               if (STRINGP (start))
7793                 pbeg = SDATA (start);
7794               else
7795                 pbeg = BYTE_POS_ADDR (start_byte);
7796               p = pbeg + p_offset;
7797               pend = pbeg + pend_offset;
7798             }
7799         }
7800       pos++;
7801     }
7802
7803   tail = list;
7804   list = Qnil;
7805   for (; CONSP (tail); tail = XCDR (tail))
7806     {
7807       elt = XCAR (tail);
7808       if (CONSP (XCDR (XCDR (elt))))
7809         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7810                       list);
7811     }
7812
7813   return list;
7814 }
7815
7816
7817 Lisp_Object
7818 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7819      Lisp_Object start, end, coding_system, dst_object;
7820      int encodep, norecord;
7821 {
7822   struct coding_system coding;
7823   EMACS_INT from, from_byte, to, to_byte;
7824   Lisp_Object src_object;
7825
7826   CHECK_NUMBER_COERCE_MARKER (start);
7827   CHECK_NUMBER_COERCE_MARKER (end);
7828   if (NILP (coding_system))
7829     coding_system = Qno_conversion;
7830   else
7831     CHECK_CODING_SYSTEM (coding_system);
7832   src_object = Fcurrent_buffer ();
7833   if (NILP (dst_object))
7834     dst_object = src_object;
7835   else if (! EQ (dst_object, Qt))
7836     CHECK_BUFFER (dst_object);
7837
7838   validate_region (&start, &end);
7839   from = XFASTINT (start);
7840   from_byte = CHAR_TO_BYTE (from);
7841   to = XFASTINT (end);
7842   to_byte = CHAR_TO_BYTE (to);
7843
7844   setup_coding_system (coding_system, &coding);
7845   coding.mode |= CODING_MODE_LAST_BLOCK;
7846
7847   if (encodep)
7848     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7849                           dst_object);
7850   else
7851     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7852                           dst_object);
7853   if (! norecord)
7854     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7855
7856   return (BUFFERP (dst_object)
7857           ? make_number (coding.produced_char)
7858           : coding.dst_object);
7859 }
7860
7861
7862 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7863        3, 4, "r\nzCoding system: ",
7864        doc: /* Decode the current region from the specified coding system.
7865 When called from a program, takes four arguments:
7866         START, END, CODING-SYSTEM, and DESTINATION.
7867 START and END are buffer positions.
7868
7869 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7870 If nil, the region between START and END is replace by the decoded text.
7871 If buffer, the decoded text is inserted in the buffer.
7872 If t, the decoded text is returned.
7873
7874 This function sets `last-coding-system-used' to the precise coding system
7875 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7876 not fully specified.)
7877 It returns the length of the decoded text.  */)
7878      (start, end, coding_system, destination)
7879      Lisp_Object start, end, coding_system, destination;
7880 {
7881   return code_convert_region (start, end, coding_system, destination, 0, 0);
7882 }
7883
7884 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7885        3, 4, "r\nzCoding system: ",
7886        doc: /* Encode the current region by specified coding system.
7887 When called from a program, takes three arguments:
7888 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7889
7890 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7891 If nil, the region between START and END is replace by the encoded text.
7892 If buffer, the encoded text is inserted in the buffer.
7893 If t, the encoded text is returned.
7894
7895 This function sets `last-coding-system-used' to the precise coding system
7896 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7897 not fully specified.)
7898 It returns the length of the encoded text.  */)
7899   (start, end, coding_system, destination)
7900      Lisp_Object start, end, coding_system, destination;
7901 {
7902   return code_convert_region (start, end, coding_system, destination, 1, 0);
7903 }
7904
7905 Lisp_Object
7906 code_convert_string (string, coding_system, dst_object,
7907                      encodep, nocopy, norecord)
7908      Lisp_Object string, coding_system, dst_object;
7909      int encodep, nocopy, norecord;
7910 {
7911   struct coding_system coding;
7912   EMACS_INT chars, bytes;
7913
7914   CHECK_STRING (string);
7915   if (NILP (coding_system))
7916     {
7917       if (! norecord)
7918         Vlast_coding_system_used = Qno_conversion;
7919       if (NILP (dst_object))
7920         return (nocopy ? Fcopy_sequence (string) : string);
7921     }
7922
7923   if (NILP (coding_system))
7924     coding_system = Qno_conversion;
7925   else
7926     CHECK_CODING_SYSTEM (coding_system);
7927   if (NILP (dst_object))
7928     dst_object = Qt;
7929   else if (! EQ (dst_object, Qt))
7930     CHECK_BUFFER (dst_object);
7931
7932   setup_coding_system (coding_system, &coding);
7933   coding.mode |= CODING_MODE_LAST_BLOCK;
7934   chars = SCHARS (string);
7935   bytes = SBYTES (string);
7936   if (encodep)
7937     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7938   else
7939     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7940   if (! norecord)
7941     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7942
7943   return (BUFFERP (dst_object)
7944           ? make_number (coding.produced_char)
7945           : coding.dst_object);
7946 }
7947
7948
7949 /* Encode or decode STRING according to CODING_SYSTEM.
7950    Do not set Vlast_coding_system_used.
7951
7952    This function is called only from macros DECODE_FILE and
7953    ENCODE_FILE, thus we ignore character composition.  */
7954
7955 Lisp_Object
7956 code_convert_string_norecord (string, coding_system, encodep)
7957      Lisp_Object string, coding_system;
7958      int encodep;
7959 {
7960   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7961 }
7962
7963
7964 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7965        2, 4, 0,
7966        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7967
7968 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7969 if the decoding operation is trivial.
7970
7971 Optional fourth arg BUFFER non-nil meant that the decoded text is
7972 inserted in BUFFER instead of returned as a string.  In this case,
7973 the return value is BUFFER.
7974
7975 This function sets `last-coding-system-used' to the precise coding system
7976 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7977 not fully specified.  */)
7978   (string, coding_system, nocopy, buffer)
7979      Lisp_Object string, coding_system, nocopy, buffer;
7980 {
7981   return code_convert_string (string, coding_system, buffer,
7982                               0, ! NILP (nocopy), 0);
7983 }
7984
7985 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7986        2, 4, 0,
7987        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7988
7989 Optional third arg NOCOPY non-nil means it is OK to return STRING
7990 itself if the encoding operation is trivial.
7991
7992 Optional fourth arg BUFFER non-nil meant that the encoded text is
7993 inserted in BUFFER instead of returned as a string.  In this case,
7994 the return value is BUFFER.
7995
7996 This function sets `last-coding-system-used' to the precise coding system
7997 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7998 not fully specified.)  */)
7999      (string, coding_system, nocopy, buffer)
8000      Lisp_Object string, coding_system, nocopy, buffer;
8001 {
8002   return code_convert_string (string, coding_system, buffer,
8003                               1, ! NILP (nocopy), 1);
8004 }
8005
8006 \f
8007 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8008        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8009 Return the corresponding character.  */)
8010      (code)
8011      Lisp_Object code;
8012 {
8013   Lisp_Object spec, attrs, val;
8014   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8015   int c;
8016
8017   CHECK_NATNUM (code);
8018   c = XFASTINT (code);
8019   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8020   attrs = AREF (spec, 0);
8021
8022   if (ASCII_BYTE_P (c)
8023       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8024     return code;
8025
8026   val = CODING_ATTR_CHARSET_LIST (attrs);
8027   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8028   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8029   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8030
8031   if (c <= 0x7F)
8032     charset = charset_roman;
8033   else if (c >= 0xA0 && c < 0xDF)
8034     {
8035       charset = charset_kana;
8036       c -= 0x80;
8037     }
8038   else
8039     {
8040       int s1 = c >> 8, s2 = c & 0xFF;
8041
8042       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8043           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8044         error ("Invalid code: %d", code);
8045       SJIS_TO_JIS (c);
8046       charset = charset_kanji;
8047     }
8048   c = DECODE_CHAR (charset, c);
8049   if (c < 0)
8050     error ("Invalid code: %d", code);
8051   return make_number (c);
8052 }
8053
8054
8055 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8056        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8057 Return the corresponding code in SJIS.  */)
8058      (ch)
8059     Lisp_Object ch;
8060 {
8061   Lisp_Object spec, attrs, charset_list;
8062   int c;
8063   struct charset *charset;
8064   unsigned code;
8065
8066   CHECK_CHARACTER (ch);
8067   c = XFASTINT (ch);
8068   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8069   attrs = AREF (spec, 0);
8070
8071   if (ASCII_CHAR_P (c)
8072       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8073     return ch;
8074
8075   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8076   charset = char_charset (c, charset_list, &code);
8077   if (code == CHARSET_INVALID_CODE (charset))
8078     error ("Can't encode by shift_jis encoding: %d", c);
8079   JIS_TO_SJIS (code);
8080
8081   return make_number (code);
8082 }
8083
8084 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8085        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8086 Return the corresponding character.  */)
8087      (code)
8088      Lisp_Object code;
8089 {
8090   Lisp_Object spec, attrs, val;
8091   struct charset *charset_roman, *charset_big5, *charset;
8092   int c;
8093
8094   CHECK_NATNUM (code);
8095   c = XFASTINT (code);
8096   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8097   attrs = AREF (spec, 0);
8098
8099   if (ASCII_BYTE_P (c)
8100       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8101     return code;
8102
8103   val = CODING_ATTR_CHARSET_LIST (attrs);
8104   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8105   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8106
8107   if (c <= 0x7F)
8108     charset = charset_roman;
8109   else
8110     {
8111       int b1 = c >> 8, b2 = c & 0x7F;
8112       if (b1 < 0xA1 || b1 > 0xFE
8113           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8114         error ("Invalid code: %d", code);
8115       charset = charset_big5;
8116     }
8117   c = DECODE_CHAR (charset, (unsigned )c);
8118   if (c < 0)
8119     error ("Invalid code: %d", code);
8120   return make_number (c);
8121 }
8122
8123 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8124        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8125 Return the corresponding character code in Big5.  */)
8126      (ch)
8127      Lisp_Object ch;
8128 {
8129   Lisp_Object spec, attrs, charset_list;
8130   struct charset *charset;
8131   int c;
8132   unsigned code;
8133
8134   CHECK_CHARACTER (ch);
8135   c = XFASTINT (ch);
8136   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8137   attrs = AREF (spec, 0);
8138   if (ASCII_CHAR_P (c)
8139       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8140     return ch;
8141
8142   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8143   charset = char_charset (c, charset_list, &code);
8144   if (code == CHARSET_INVALID_CODE (charset))
8145     error ("Can't encode by Big5 encoding: %d", c);
8146
8147   return make_number (code);
8148 }
8149
8150 \f
8151 DEFUN ("set-terminal-coding-system-internal",
8152        Fset_terminal_coding_system_internal,
8153        Sset_terminal_coding_system_internal, 1, 1, 0,
8154        doc: /* Internal use only.  */)
8155      (coding_system)
8156      Lisp_Object coding_system;
8157 {
8158   CHECK_SYMBOL (coding_system);
8159   setup_coding_system (Fcheck_coding_system (coding_system),
8160                         &terminal_coding);
8161
8162   /* We had better not send unsafe characters to terminal.  */
8163   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8164   /* Characer composition should be disabled.  */
8165   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8166   terminal_coding.src_multibyte = 1;
8167   terminal_coding.dst_multibyte = 0;
8168   return Qnil;
8169 }
8170
8171 DEFUN ("set-safe-terminal-coding-system-internal",
8172        Fset_safe_terminal_coding_system_internal,
8173        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8174        doc: /* Internal use only.  */)
8175      (coding_system)
8176      Lisp_Object coding_system;
8177 {
8178   CHECK_SYMBOL (coding_system);
8179   setup_coding_system (Fcheck_coding_system (coding_system),
8180                        &safe_terminal_coding);
8181   /* Characer composition should be disabled.  */
8182   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8183   safe_terminal_coding.src_multibyte = 1;
8184   safe_terminal_coding.dst_multibyte = 0;
8185   return Qnil;
8186 }
8187
8188 DEFUN ("terminal-coding-system",
8189        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8190        doc: /* Return coding system specified for terminal output.  */)
8191      ()
8192 {
8193   Lisp_Object coding_system;
8194
8195   coding_system = CODING_ID_NAME (terminal_coding.id);
8196   /* For backward compatibility, return nil if it is `undecided'. */
8197   return (coding_system != Qundecided ? coding_system : Qnil);
8198 }
8199
8200 DEFUN ("set-keyboard-coding-system-internal",
8201        Fset_keyboard_coding_system_internal,
8202        Sset_keyboard_coding_system_internal, 1, 1, 0,
8203        doc: /* Internal use only.  */)
8204      (coding_system)
8205      Lisp_Object coding_system;
8206 {
8207   CHECK_SYMBOL (coding_system);
8208   setup_coding_system (Fcheck_coding_system (coding_system),
8209                        &keyboard_coding);
8210   /* Characer composition should be disabled.  */
8211   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8212   return Qnil;
8213 }
8214
8215 DEFUN ("keyboard-coding-system",
8216        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8217        doc: /* Return coding system specified for decoding keyboard input.  */)
8218      ()
8219 {
8220   return CODING_ID_NAME (keyboard_coding.id);
8221 }
8222
8223 \f
8224 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8225        Sfind_operation_coding_system,  1, MANY, 0,
8226        doc: /* Choose a coding system for an operation based on the target name.
8227 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8228 DECODING-SYSTEM is the coding system to use for decoding
8229 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8230 for encoding (in case OPERATION does encoding).
8231
8232 The first argument OPERATION specifies an I/O primitive:
8233   For file I/O, `insert-file-contents' or `write-region'.
8234   For process I/O, `call-process', `call-process-region', or `start-process'.
8235   For network I/O, `open-network-stream'.
8236
8237 The remaining arguments should be the same arguments that were passed
8238 to the primitive.  Depending on which primitive, one of those arguments
8239 is selected as the TARGET.  For example, if OPERATION does file I/O,
8240 whichever argument specifies the file name is TARGET.
8241
8242 TARGET has a meaning which depends on OPERATION:
8243   For file I/O, TARGET is a file name.
8244   For process I/O, TARGET is a process name.
8245   For network I/O, TARGET is a service name or a port number
8246
8247 This function looks up what specified for TARGET in,
8248 `file-coding-system-alist', `process-coding-system-alist',
8249 or `network-coding-system-alist' depending on OPERATION.
8250 They may specify a coding system, a cons of coding systems,
8251 or a function symbol to call.
8252 In the last case, we call the function with one argument,
8253 which is a list of all the arguments given to this function.
8254
8255 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8256      (nargs, args)
8257      int nargs;
8258      Lisp_Object *args;
8259 {
8260   Lisp_Object operation, target_idx, target, val;
8261   register Lisp_Object chain;
8262
8263   if (nargs < 2)
8264     error ("Too few arguments");
8265   operation = args[0];
8266   if (!SYMBOLP (operation)
8267       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8268     error ("Invalid first arguement");
8269   if (nargs < 1 + XINT (target_idx))
8270     error ("Too few arguments for operation: %s",
8271            SDATA (SYMBOL_NAME (operation)));
8272   target = args[XINT (target_idx) + 1];
8273   if (!(STRINGP (target)
8274         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8275     error ("Invalid %dth argument", XINT (target_idx) + 1);
8276
8277   chain = ((EQ (operation, Qinsert_file_contents)
8278             || EQ (operation, Qwrite_region))
8279            ? Vfile_coding_system_alist
8280            : (EQ (operation, Qopen_network_stream)
8281               ? Vnetwork_coding_system_alist
8282               : Vprocess_coding_system_alist));
8283   if (NILP (chain))
8284     return Qnil;
8285
8286   for (; CONSP (chain); chain = XCDR (chain))
8287     {
8288       Lisp_Object elt;
8289
8290       elt = XCAR (chain);
8291       if (CONSP (elt)
8292           && ((STRINGP (target)
8293                && STRINGP (XCAR (elt))
8294                && fast_string_match (XCAR (elt), target) >= 0)
8295               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8296         {
8297           val = XCDR (elt);
8298           /* Here, if VAL is both a valid coding system and a valid
8299              function symbol, we return VAL as a coding system.  */
8300           if (CONSP (val))
8301             return val;
8302           if (! SYMBOLP (val))
8303             return Qnil;
8304           if (! NILP (Fcoding_system_p (val)))
8305             return Fcons (val, val);
8306           if (! NILP (Ffboundp (val)))
8307             {
8308               val = call1 (val, Flist (nargs, args));
8309               if (CONSP (val))
8310                 return val;
8311               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8312                 return Fcons (val, val);
8313             }
8314           return Qnil;
8315         }
8316     }
8317   return Qnil;
8318 }
8319
8320 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8321        Sset_coding_system_priority, 0, MANY, 0,
8322        doc: /* Assign higher priority to the coding systems given as arguments.
8323 If multiple coding systems belongs to the same category,
8324 all but the first one are ignored.
8325
8326 usage: (set-coding-system-priority ...)  */)
8327      (nargs, args)
8328      int nargs;
8329      Lisp_Object *args;
8330 {
8331   int i, j;
8332   int changed[coding_category_max];
8333   enum coding_category priorities[coding_category_max];
8334
8335   bzero (changed, sizeof changed);
8336
8337   for (i = j = 0; i < nargs; i++)
8338     {
8339       enum coding_category category;
8340       Lisp_Object spec, attrs;
8341
8342       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8343       attrs = AREF (spec, 0);
8344       category = XINT (CODING_ATTR_CATEGORY (attrs));
8345       if (changed[category])
8346         /* Ignore this coding system because a coding system of the
8347            same category already had a higher priority.  */
8348         continue;
8349       changed[category] = 1;
8350       priorities[j++] = category;
8351       if (coding_categories[category].id >= 0
8352           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8353         setup_coding_system (args[i], &coding_categories[category]);
8354       Fset (AREF (Vcoding_category_table, category), args[i]);
8355     }
8356
8357   /* Now we have decided top J priorities.  Reflect the order of the
8358      original priorities to the remaining priorities.  */
8359
8360   for (i = j, j = 0; i < coding_category_max; i++, j++)
8361     {
8362       while (j < coding_category_max
8363              && changed[coding_priorities[j]])
8364         j++;
8365       if (j == coding_category_max)
8366         abort ();
8367       priorities[i] = coding_priorities[j];
8368     }
8369
8370   bcopy (priorities, coding_priorities, sizeof priorities);
8371
8372   /* Update `coding-category-list'.  */
8373   Vcoding_category_list = Qnil;
8374   for (i = coding_category_max - 1; i >= 0; i--)
8375     Vcoding_category_list
8376       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8377                Vcoding_category_list);
8378
8379   return Qnil;
8380 }
8381
8382 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8383        Scoding_system_priority_list, 0, 1, 0,
8384        doc: /* Return a list of coding systems ordered by their priorities.
8385 HIGHESTP non-nil means just return the highest priority one.  */)
8386      (highestp)
8387      Lisp_Object highestp;
8388 {
8389   int i;
8390   Lisp_Object val;
8391
8392   for (i = 0, val = Qnil; i < coding_category_max; i++)
8393     {
8394       enum coding_category category = coding_priorities[i];
8395       int id = coding_categories[category].id;
8396       Lisp_Object attrs;
8397
8398       if (id < 0)
8399         continue;
8400       attrs = CODING_ID_ATTRS (id);
8401       if (! NILP (highestp))
8402         return CODING_ATTR_BASE_NAME (attrs);
8403       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8404     }
8405   return Fnreverse (val);
8406 }
8407
8408 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8409
8410 static Lisp_Object
8411 make_subsidiaries (base)
8412      Lisp_Object base;
8413 {
8414   Lisp_Object subsidiaries;
8415   int base_name_len = SBYTES (SYMBOL_NAME (base));
8416   char *buf = (char *) alloca (base_name_len + 6);
8417   int i;
8418
8419   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8420   subsidiaries = Fmake_vector (make_number (3), Qnil);
8421   for (i = 0; i < 3; i++)
8422     {
8423       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8424       ASET (subsidiaries, i, intern (buf));
8425     }
8426   return subsidiaries;
8427 }
8428
8429
8430 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8431        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8432        doc: /* For internal use only.
8433 usage: (define-coding-system-internal ...)  */)
8434      (nargs, args)
8435      int nargs;
8436      Lisp_Object *args;
8437 {
       /* Define a coding system from the NARGS arguments in ARGS, which
          are indexed by the coding_arg_* constants.  The function builds
          the attribute vector ATTRS, validates the arguments specific to
          the coding type, chooses the coding category, and registers the
          coding system in Vcoding_system_hash_table, Vcoding_system_list
          and Vcoding_system_alist.  When the eol-type argument is nil,
          three subsidiary coding systems made by make_subsidiaries are
          registered as well.  Returns Qnil.  Signals
          wrong-number-of-arguments when ARGS is too short for the
          coding type.  */
8438   Lisp_Object name;
8439   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
8440   Lisp_Object attrs;            /* Vector of attributes.  */
8441   Lisp_Object eol_type;
8442   Lisp_Object aliases;
8443   Lisp_Object coding_type, charset_list, safe_charsets;
8444   enum coding_category category;
8445   Lisp_Object tail, val;
8446   int max_charset_id = 0;
8447   int i;
8448
       /* Every coding type needs at least the common arguments.  */
8449   if (nargs < coding_arg_max)
8450     goto short_args;
8451
8452   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8453
8454   name = args[coding_arg_name];
8455   CHECK_SYMBOL (name);
8456   CODING_ATTR_BASE_NAME (attrs) = name;
8457
       /* The mnemonic is either a string or a character.  */
8458   val = args[coding_arg_mnemonic];
8459   if (! STRINGP (val))
8460     CHECK_CHARACTER (val);
8461   CODING_ATTR_MNEMONIC (attrs) = val;
8462
8463   coding_type = args[coding_arg_coding_type];
8464   CHECK_SYMBOL (coding_type);
8465   CODING_ATTR_TYPE (attrs) = coding_type;
8466
       /* The charset-list argument is either a symbol or a list of
          charsets.  The symbols `iso-2022' and `emacs-mule' are replaced
          by Viso_2022_charset_list and Vemacs_mule_charset_list, and are
          only accepted for the coding type of the same name.  A list is
          copied and each element is replaced in the copy by the ID of
          the charset.  MAX_CHARSET_ID ends up as the largest ID seen.  */
8467   charset_list = args[coding_arg_charset_list];
8468   if (SYMBOLP (charset_list))
8469     {
8470       if (EQ (charset_list, Qiso_2022))
8471         {
8472           if (! EQ (coding_type, Qiso_2022))
8473             error ("Invalid charset-list");
8474           charset_list = Viso_2022_charset_list;
8475         }
8476       else if (EQ (charset_list, Qemacs_mule))
8477         {
8478           if (! EQ (coding_type, Qemacs_mule))
8479             error ("Invalid charset-list");
8480           charset_list = Vemacs_mule_charset_list;
8481         }
8482       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8483         if (max_charset_id < XFASTINT (XCAR (tail)))
8484           max_charset_id = XFASTINT (XCAR (tail));
8485     }
8486   else
8487     {
8488       charset_list = Fcopy_sequence (charset_list);
8489       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8490         {
8491           struct charset *charset;
8492
8493           val = Fcar (tail);
8494           CHECK_CHARSET_GET_CHARSET (val, charset);
               /* An iso-2022 coding system needs a charset with a
                  non-negative ISO final, an emacs-mule one a charset
                  with a non-negative emacs-mule ID.  */
8495           if (EQ (coding_type, Qiso_2022)
8496               ? CHARSET_ISO_FINAL (charset) < 0
8497               : EQ (coding_type, Qemacs_mule)
8498               ? CHARSET_EMACS_MULE_ID (charset) < 0
8499               : 0)
8500             error ("Can't handle charset `%s'",
8501                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8502
8503           XSETCAR (tail, make_number (charset->id));
8504           if (max_charset_id < charset->id)
8505             max_charset_id = charset->id;
8506         }
8507     }
8508   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8509
       /* SAFE_CHARSETS is a string of MAX_CHARSET_ID + 1 elements made
          by Fmake_string with the value 255; the element at each charset
          ID found in CHARSET_LIST is then set to 0.  */
8510   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8511                                 make_number (255));
8512   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8513     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8514   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8515
8516   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8517
       /* A translation table argument is a char-table, a cons, or a
          symbol.  */
8518   val = args[coding_arg_decode_translation_table];
8519   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8520     CHECK_SYMBOL (val);
8521   CODING_ATTR_DECODE_TBL (attrs) = val;
8522
8523   val = args[coding_arg_encode_translation_table];
8524   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8525     CHECK_SYMBOL (val);
8526   CODING_ATTR_ENCODE_TBL (attrs) = val;
8527
8528   val = args[coding_arg_post_read_conversion];
8529   CHECK_SYMBOL (val);
8530   CODING_ATTR_POST_READ (attrs) = val;
8531
8532   val = args[coding_arg_pre_write_conversion];
8533   CHECK_SYMBOL (val);
8534   CODING_ATTR_PRE_WRITE (attrs) = val;
8535
       /* The default character is a space unless one is given.  */
8536   val = args[coding_arg_default_char];
8537   if (NILP (val))
8538     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8539   else
8540     {
8541       CHECK_CHARACTER (val);
8542       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8543     }
8544
       /* Normalize the for-unibyte argument to nil or t.  */
8545   val = args[coding_arg_for_unibyte];
8546   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8547
8548   val = args[coding_arg_plist];
8549   CHECK_LIST (val);
8550   CODING_ATTR_PLIST (attrs) = val;
8551
       /* From here on, handle what is specific to each coding type and
          decide CATEGORY.  */
8552   if (EQ (coding_type, Qcharset))
8553     {
8554       /* Generate a lisp vector of 256 elements.  Each element is nil,
8555          integer, or a list of charset IDs.
8556
8557          If Nth element is nil, the byte code N is invalid in this
8558          coding system.
8559
8560          If Nth element is a number NUM, N is the first byte of a
8561          charset whose ID is NUM.
8562
8563          If Nth element is a list of charset IDs, N is the first byte
8564          of one of them.  The list is sorted by dimensions of the
8565          charsets.  A charset of smaller dimension comes first. */
8566       val = Fmake_vector (make_number (256), Qnil);
8567
8568       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8569         {
8570           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8571           int dim = CHARSET_DIMENSION (charset);
               /* code_space[IDX] and code_space[IDX + 1] bound the byte
                  values walked by the loop below.  */
8572           int idx = (dim - 1) * 4;
8573
8574           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8575             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8576
8577           for (i = charset->code_space[idx];
8578                i <= charset->code_space[idx + 1]; i++)
8579             {
8580               Lisp_Object tmp, tmp2;
8581               int dim2;
8582
8583               tmp = AREF (val, i);
8584               if (NILP (tmp))
8585                 tmp = XCAR (tail);
8586               else if (NUMBERP (tmp))
8587                 {
                       /* One charset is already recorded for this byte:
                          make a two-element list, the charset of smaller
                          dimension first.  */
8588                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8589                   if (dim < dim2)
8590                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8591                   else
8592                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8593                 }
8594               else
8595                 {
                       /* A list is already recorded: find the first
                          element whose dimension exceeds DIM and insert
                          before it, or append when there is none.  */
8596                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8597                     {
8598                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8599                       if (dim < dim2)
8600                         break;
8601                     }
8602                   if (NILP (tmp2))
8603                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8604                   else
8605                     {
8606                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8607                       XSETCAR (tmp2, XCAR (tail));
8608                     }
8609                 }
8610               ASET (val, i, tmp);
8611             }
8612         }
8613       ASET (attrs, coding_attr_charset_valids, val);
8614       category = coding_category_charset;
8615     }
8616   else if (EQ (coding_type, Qccl))
8617     {
8618       Lisp_Object valids;
8619
8620       if (nargs < coding_arg_ccl_max)
8621         goto short_args;
8622
           /* The decoder and encoder must be CCL programs; a vector is
              copied before it is stored.  */
8623       val = args[coding_arg_ccl_decoder];
8624       CHECK_CCL_PROGRAM (val);
8625       if (VECTORP (val))
8626         val = Fcopy_sequence (val);
8627       ASET (attrs, coding_attr_ccl_decoder, val);
8628
8629       val = args[coding_arg_ccl_encoder];
8630       CHECK_CCL_PROGRAM (val);
8631       if (VECTORP (val))
8632         val = Fcopy_sequence (val);
8633       ASET (attrs, coding_attr_ccl_encoder, val);
8634
           /* Each element of the valids argument is an integer in
              0..255 or a cons (FROM . TO) with FROM <= TO <= 255.  The
              elements of VALIDS in that inclusive range are set to 1.  */
8635       val = args[coding_arg_ccl_valids];
8636       valids = Fmake_string (make_number (256), make_number (0));
8637       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8638         {
8639           int from, to;
8640
8641           val = Fcar (tail);
8642           if (INTEGERP (val))
8643             {
8644               from = to = XINT (val);
8645               if (from < 0 || from > 255)
8646                 args_out_of_range_3 (val, make_number (0), make_number (255));
8647             }
8648           else
8649             {
8650               CHECK_CONS (val);
8651               CHECK_NATNUM_CAR (val);
8652               CHECK_NATNUM_CDR (val);
8653               from = XINT (XCAR (val));
8654               if (from > 255)
8655                 args_out_of_range_3 (XCAR (val),
8656                                      make_number (0), make_number (255));
8657               to = XINT (XCDR (val));
8658               if (to < from || to > 255)
8659                 args_out_of_range_3 (XCDR (val),
8660                                      XCAR (val), make_number (255));
8661             }
8662           for (i = from; i <= to; i++)
8663             SSET (valids, i, 1);
8664         }
8665       ASET (attrs, coding_attr_ccl_valids, valids);
8666
8667       category = coding_category_ccl;
8668     }
8669   else if (EQ (coding_type, Qutf_16))
8670     {
8671       Lisp_Object bom, endian;
8672
8673       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8674
8675       if (nargs < coding_arg_utf16_max)
8676         goto short_args;
8677
           /* BOM is nil, t, or a cons of two coding systems.  */
8678       bom = args[coding_arg_utf16_bom];
8679       if (! NILP (bom) && ! EQ (bom, Qt))
8680         {
8681           CHECK_CONS (bom);
8682           val = XCAR (bom);
8683           CHECK_CODING_SYSTEM (val);
8684           val = XCDR (bom);
8685           CHECK_CODING_SYSTEM (val);
8686         }
8687       ASET (attrs, coding_attr_utf_16_bom, bom);
8688
           /* ENDIAN is `big' or `little'; nil is treated as `big'.  */
8689       endian = args[coding_arg_utf16_endian];
8690       CHECK_SYMBOL (endian);
8691       if (NILP (endian))
8692         endian = Qbig;
8693       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8694         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8695       ASET (attrs, coding_attr_utf_16_endian, endian);
8696
           /* A cons BOM selects the auto category; otherwise the
              category follows from ENDIAN and from whether BOM is nil
              (the "nosig" categories) or t.  */
8697       category = (CONSP (bom)
8698                   ? coding_category_utf_16_auto
8699                   : NILP (bom)
8700                   ? (EQ (endian, Qbig)
8701                      ? coding_category_utf_16_be_nosig
8702                      : coding_category_utf_16_le_nosig)
8703                   : (EQ (endian, Qbig)
8704                      ? coding_category_utf_16_be
8705                      : coding_category_utf_16_le));
8706     }
8707   else if (EQ (coding_type, Qiso_2022))
8708     {
8709       Lisp_Object initial, reg_usage, request, flags;
           /* NOTE: this I shadows the function-level I inside this
              branch.  */
8710       int i;
8711
8712       if (nargs < coding_arg_iso2022_max)
8713         goto short_args;
8714
           /* INITIAL is a copy of a vector.  Each of its first four
              elements is replaced by the ID of the charset it names, or
              by -1 when it is nil.  */
8715       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8716       CHECK_VECTOR (initial);
8717       for (i = 0; i < 4; i++)
8718         {
8719           val = Faref (initial, make_number (i));
8720           if (! NILP (val))
8721             {
8722               struct charset *charset;
8723
8724               CHECK_CHARSET_GET_CHARSET (val, charset);
8725               ASET (initial, i, make_number (CHARSET_ID (charset)));
8726               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8727                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8728             }
8729           else
8730             ASET (initial, i, make_number (-1));
8731         }
8732
           /* REG_USAGE is a cons of two numbers.  */
8733       reg_usage = args[coding_arg_iso2022_reg_usage];
8734       CHECK_CONS (reg_usage);
8735       CHECK_NUMBER_CAR (reg_usage);
8736       CHECK_NUMBER_CDR (reg_usage);
8737
           /* Each element of REQUEST is a cons (CHARSET . REG) where REG
              is a natural number below 4; CHARSET is replaced by its
              ID.  */
8738       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8739       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8740         {
8741           int id;
8742           Lisp_Object tmp;
8743
8744           val = Fcar (tail);
8745           CHECK_CONS (val);
8746           tmp = XCAR (val);
8747           CHECK_CHARSET_GET_ID (tmp, id);
8748           CHECK_NATNUM_CDR (val);
8749           if (XINT (XCDR (val)) >= 4)
8750             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8751           XSETCAR (val, make_number (id));
8752         }
8753
           /* From here on I holds the flag bits.  The full-support flag
              is added to the stored FLAGS (not to I) when the
              charset-list argument was the symbol `iso-2022'.  */
8754       flags = args[coding_arg_iso2022_flags];
8755       CHECK_NATNUM (flags);
8756       i = XINT (flags);
8757       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8758         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8759
8760       ASET (attrs, coding_attr_iso_initial, initial);
8761       ASET (attrs, coding_attr_iso_usage, reg_usage);
8762       ASET (attrs, coding_attr_iso_request, request);
8763       ASET (attrs, coding_attr_iso_flags, flags);
8764       setup_iso_safe_charsets (attrs);
8765
           /* Choose among the iso categories from the flag bits, from
              whether the charset-list argument was `iso-2022', and, in
              the 8-bit case, from the charset in element 1 of INITIAL.  */
8766       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8767         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8768                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8769                     ? coding_category_iso_7_else
8770                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8771                     ? coding_category_iso_7
8772                     : coding_category_iso_7_tight);
8773       else
8774         {
8775           int id = XINT (AREF (initial, 1));
8776
8777           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8778                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8779                        || id < 0)
8780                       ? coding_category_iso_8_else
8781                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8782                       ? coding_category_iso_8_1
8783                       : coding_category_iso_8_2);
8784         }
           /* Only the iso-8-1 and iso-8-2 categories keep the
              ASCII-compatible attribute.  */
8785       if (category != coding_category_iso_8_1
8786           && category != coding_category_iso_8_2)
8787         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8788     }
8789   else if (EQ (coding_type, Qemacs_mule))
8790     {
8791       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8792         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8793       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8794       category = coding_category_emacs_mule;
8795     }
8796   else if (EQ (coding_type, Qshift_jis))
8797     {
8798
8799       struct charset *charset;
8800
           /* Three or four charsets are required, with dimensions
              1, 1, 2 and, when present, 2.  The code below advances the
              local CHARSET_LIST only; the attribute was already stored
              above.  */
8801       if (XINT (Flength (charset_list)) != 3
8802           && XINT (Flength (charset_list)) != 4)
8803         error ("There should be three or four charsets");
8804
8805       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8806       if (CHARSET_DIMENSION (charset) != 1)
8807         error ("Dimension of charset %s is not one",
8808                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8809       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8810         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8811
8812       charset_list = XCDR (charset_list);
8813       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8814       if (CHARSET_DIMENSION (charset) != 1)
8815         error ("Dimension of charset %s is not one",
8816                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8817
8818       charset_list = XCDR (charset_list);
8819       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8820       if (CHARSET_DIMENSION (charset) != 2)
8821         error ("Dimension of charset %s is not two",
8822                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8823
8824       charset_list = XCDR (charset_list);
8825       if (! NILP (charset_list))
8826         {
8827           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8828           if (CHARSET_DIMENSION (charset) != 2)
8829             error ("Dimension of charset %s is not two",
8830                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8831         }
8832
8833       category = coding_category_sjis;
8834       Vsjis_coding_system = name;
8835     }
8836   else if (EQ (coding_type, Qbig5))
8837     {
8838       struct charset *charset;
8839
           /* Exactly two charsets are required, of dimension 1 and 2.  */
8840       if (XINT (Flength (charset_list)) != 2)
8841         error ("There should be just two charsets");
8842
8843       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8844       if (CHARSET_DIMENSION (charset) != 1)
8845         error ("Dimension of charset %s is not one",
8846                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8847       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8848         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8849
8850       charset_list = XCDR (charset_list);
8851       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8852       if (CHARSET_DIMENSION (charset) != 2)
8853         error ("Dimension of charset %s is not two",
8854                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8855
8856       category = coding_category_big5;
8857       Vbig5_coding_system = name;
8858     }
8859   else if (EQ (coding_type, Qraw_text))
8860     {
8861       category = coding_category_raw_text;
8862       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8863     }
8864   else if (EQ (coding_type, Qutf_8))
8865     {
8866       category = coding_category_utf_8;
8867       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8868     }
8869   else if (EQ (coding_type, Qundecided))
8870     category = coding_category_undecided;
8871   else
8872     error ("Invalid coding system type: %s",
8873            SDATA (SYMBOL_NAME (coding_type)));
8874
       /* Record the category, and push the category symbol and the
          final ASCII-compatible value onto the front of the plist.  */
8875   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8876   CODING_ATTR_PLIST (attrs)
8877     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8878                                 CODING_ATTR_PLIST (attrs)));
8879   CODING_ATTR_PLIST (attrs)
8880     = Fcons (QCascii_compatible_p,
8881              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8882                     CODING_ATTR_PLIST (attrs)));
8883
       /* The eol-type argument is nil, `unix', `dos' or `mac'.  */
8884   eol_type = args[coding_arg_eol_type];
8885   if (! NILP (eol_type)
8886       && ! EQ (eol_type, Qunix)
8887       && ! EQ (eol_type, Qdos)
8888       && ! EQ (eol_type, Qmac))
8889     error ("Invalid eol-type");
8890
8891   aliases = Fcons (name, Qnil);
8892
       /* With a nil eol-type, register three subsidiary coding systems
          that share ATTRS and have eol-types `unix', `dos' and `mac'.
          EOL_TYPE becomes the vector of their names and is stored as
          such in the spec of NAME below.  */
8893   if (NILP (eol_type))
8894     {
8895       eol_type = make_subsidiaries (name);
8896       for (i = 0; i < 3; i++)
8897         {
8898           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8899
8900           this_name = AREF (eol_type, i);
8901           this_aliases = Fcons (this_name, Qnil);
8902           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8903           this_spec = Fmake_vector (make_number (3), attrs);
8904           ASET (this_spec, 1, this_aliases);
8905           ASET (this_spec, 2, this_eol_type);
8906           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8907           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8908           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8909                                         Vcoding_system_alist);
8910         }
8911     }
8912
       /* Register NAME itself with the spec [ ATTRS ALIASES EOL_TYPE ].  */
8913   spec_vec = Fmake_vector (make_number (3), attrs);
8914   ASET (spec_vec, 1, aliases);
8915   ASET (spec_vec, 2, eol_type);
8916
8917   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8918   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8919   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8920                                 Vcoding_system_alist);
8921
       /* Set up the entry of coding_categories for CATEGORY when it has
          no coding system yet (negative ID) or when its current coding
          system has the name NAME.  */
8922   {
8923     int id = coding_categories[category].id;
8924
8925     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8926       setup_coding_system (name, &coding_categories[category]);
8927   }
8928
8929   return Qnil;
8930
       /* Reached when ARGS is too short for the coding type.  */
8931  short_args:
8932   return Fsignal (Qwrong_number_of_arguments,
8933                   Fcons (intern ("define-coding-system-internal"),
8934                          make_number (nargs)));
8935 }
8936
8937
8938 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8939        3, 3, 0,
8940        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8941   (coding_system, prop, val)
8942      Lisp_Object coding_system, prop, val;
8943 {
8944   Lisp_Object spec, attrs;
8945
8946   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8947   attrs = AREF (spec, 0);
8948   if (EQ (prop, QCmnemonic))
8949     {
8950       if (! STRINGP (val))
8951         CHECK_CHARACTER (val);
8952       CODING_ATTR_MNEMONIC (attrs) = val;
8953     }
8954   else if (EQ (prop, QCdefalut_char))
8955     {
8956       if (NILP (val))
8957         val = make_number (' ');
8958       else
8959         CHECK_CHARACTER (val);
8960       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8961     }
8962   else if (EQ (prop, QCdecode_translation_table))
8963     {
8964       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8965         CHECK_SYMBOL (val);
8966       CODING_ATTR_DECODE_TBL (attrs) = val;
8967     }
8968   else if (EQ (prop, QCencode_translation_table))
8969     {
8970       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8971         CHECK_SYMBOL (val);
8972       CODING_ATTR_ENCODE_TBL (attrs) = val;
8973     }
8974   else if (EQ (prop, QCpost_read_conversion))
8975     {
8976       CHECK_SYMBOL (val);
8977       CODING_ATTR_POST_READ (attrs) = val;
8978     }
8979   else if (EQ (prop, QCpre_write_conversion))
8980     {
8981       CHECK_SYMBOL (val);
8982       CODING_ATTR_PRE_WRITE (attrs) = val;
8983     }
8984   else if (EQ (prop, QCascii_compatible_p))
8985     {
8986       CODING_ATTR_ASCII_COMPAT (attrs) = val;
8987     }
8988
8989   CODING_ATTR_PLIST (attrs)
8990     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8991   return val;
8992 }
8993
8994
8995 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8996        Sdefine_coding_system_alias, 2, 2, 0,
8997        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8998      (alias, coding_system)
8999      Lisp_Object alias, coding_system;
9000 {
9001   Lisp_Object spec, aliases, eol_type;
9002
9003   CHECK_SYMBOL (alias);
9004   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9005   aliases = AREF (spec, 1);
9006   /* ALISES should be a list of length more than zero, and the first
9007      element is a base coding system.  Append ALIAS at the tail of the
9008      list.  */
9009   while (!NILP (XCDR (aliases)))
9010     aliases = XCDR (aliases);
9011   XSETCDR (aliases, Fcons (alias, Qnil));
9012
9013   eol_type = AREF (spec, 2);
9014   if (VECTORP (eol_type))
9015     {
9016       Lisp_Object subsidiaries;
9017       int i;
9018
9019       subsidiaries = make_subsidiaries (alias);
9020       for (i = 0; i < 3; i++)
9021         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9022                                      AREF (eol_type, i));
9023     }
9024
9025   Fputhash (alias, spec, Vcoding_system_hash_table);
9026   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9027   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9028                                 Vcoding_system_alist);
9029
9030   return Qnil;
9031 }
9032
9033 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9034        1, 1, 0,
9035        doc: /* Return the base of CODING-SYSTEM.
9036 Any alias or subsidiary coding system is not a base coding system.  */)
9037   (coding_system)
9038      Lisp_Object coding_system;
9039 {
9040   Lisp_Object spec, attrs;
9041
9042   if (NILP (coding_system))
9043     return (Qno_conversion);
9044   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9045   attrs = AREF (spec, 0);
9046   return CODING_ATTR_BASE_NAME (attrs);
9047 }
9048
9049 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9050        1, 1, 0,
9051        doc: "Return the property list of CODING-SYSTEM.")
9052      (coding_system)
9053      Lisp_Object coding_system;
9054 {
9055   Lisp_Object spec, attrs;
9056
9057   if (NILP (coding_system))
9058     coding_system = Qno_conversion;
9059   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9060   attrs = AREF (spec, 0);
9061   return CODING_ATTR_PLIST (attrs);
9062 }
9063
9064
9065 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9066        1, 1, 0,
9067        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9068      (coding_system)
9069      Lisp_Object coding_system;
9070 {
9071   Lisp_Object spec;
9072
9073   if (NILP (coding_system))
9074     coding_system = Qno_conversion;
9075   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9076   return AREF (spec, 1);
9077 }
9078
9079 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9080        Scoding_system_eol_type, 1, 1, 0,
9081        doc: /* Return eol-type of CODING-SYSTEM.
9082 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9083
9084 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9085 and CR respectively.
9086
9087 A vector value indicates that a format of end-of-line should be
9088 detected automatically.  Nth element of the vector is the subsidiary
9089 coding system whose eol-type is N.  */)
9090      (coding_system)
9091      Lisp_Object coding_system;
9092 {
9093   Lisp_Object spec, eol_type;
9094   int n;
9095
9096   if (NILP (coding_system))
9097     coding_system = Qno_conversion;
9098   if (! CODING_SYSTEM_P (coding_system))
9099     return Qnil;
9100   spec = CODING_SYSTEM_SPEC (coding_system);
9101   eol_type = AREF (spec, 2);
9102   if (VECTORP (eol_type))
9103     return Fcopy_sequence (eol_type);
9104   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9105   return make_number (n);
9106 }
9107
9108 #endif /* emacs */
9109
9110 \f
/*** 12. Postamble ***/
9112
9113 void
9114 init_coding_once ()
9115 {
9116   int i;
9117
9118   for (i = 0; i < coding_category_max; i++)
9119     {
9120       coding_categories[i].id = -1;
9121       coding_priorities[i] = i;
9122     }
9123
9124   /* ISO2022 specific initialize routine.  */
9125   for (i = 0; i < 0x20; i++)
9126     iso_code_class[i] = ISO_control_0;
9127   for (i = 0x21; i < 0x7F; i++)
9128     iso_code_class[i] = ISO_graphic_plane_0;
9129   for (i = 0x80; i < 0xA0; i++)
9130     iso_code_class[i] = ISO_control_1;
9131   for (i = 0xA1; i < 0xFF; i++)
9132     iso_code_class[i] = ISO_graphic_plane_1;
9133   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9134   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9135   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9136   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9137   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9138   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9139   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9140   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9141   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9142
9143   for (i = 0; i < 256; i++)
9144     {
9145       emacs_mule_bytes[i] = 1;
9146     }
9147   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9148   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9149   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9150   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9151 }
9152
9153 #ifdef emacs
9154
9155 void
9156 syms_of_coding ()
9157 {
9158   staticpro (&Vcoding_system_hash_table);
9159   {
9160     Lisp_Object args[2];
9161     args[0] = QCtest;
9162     args[1] = Qeq;
9163     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9164   }
9165
9166   staticpro (&Vsjis_coding_system);
9167   Vsjis_coding_system = Qnil;
9168
9169   staticpro (&Vbig5_coding_system);
9170   Vbig5_coding_system = Qnil;
9171
9172   staticpro (&Vcode_conversion_reused_workbuf);
9173   Vcode_conversion_reused_workbuf = Qnil;
9174
9175   staticpro (&Vcode_conversion_workbuf_name);
9176   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9177
9178   reused_workbuf_in_use = 0;
9179
9180   DEFSYM (Qcharset, "charset");
9181   DEFSYM (Qtarget_idx, "target-idx");
9182   DEFSYM (Qcoding_system_history, "coding-system-history");
9183   Fset (Qcoding_system_history, Qnil);
9184
9185   /* Target FILENAME is the first argument.  */
9186   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9187   /* Target FILENAME is the third argument.  */
9188   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9189
9190   DEFSYM (Qcall_process, "call-process");
9191   /* Target PROGRAM is the first argument.  */
9192   Fput (Qcall_process, Qtarget_idx, make_number (0));
9193
9194   DEFSYM (Qcall_process_region, "call-process-region");
9195   /* Target PROGRAM is the third argument.  */
9196   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9197
9198   DEFSYM (Qstart_process, "start-process");
9199   /* Target PROGRAM is the third argument.  */
9200   Fput (Qstart_process, Qtarget_idx, make_number (2));
9201
9202   DEFSYM (Qopen_network_stream, "open-network-stream");
9203   /* Target SERVICE is the fourth argument.  */
9204   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9205
9206   DEFSYM (Qcoding_system, "coding-system");
9207   DEFSYM (Qcoding_aliases, "coding-aliases");
9208
9209   DEFSYM (Qeol_type, "eol-type");
9210   DEFSYM (Qunix, "unix");
9211   DEFSYM (Qdos, "dos");
9212
9213   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9214   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9215   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9216   DEFSYM (Qdefault_char, "default-char");
9217   DEFSYM (Qundecided, "undecided");
9218   DEFSYM (Qno_conversion, "no-conversion");
9219   DEFSYM (Qraw_text, "raw-text");
9220
9221   DEFSYM (Qiso_2022, "iso-2022");
9222
9223   DEFSYM (Qutf_8, "utf-8");
9224   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9225
9226   DEFSYM (Qutf_16, "utf-16");
9227   DEFSYM (Qbig, "big");
9228   DEFSYM (Qlittle, "little");
9229
9230   DEFSYM (Qshift_jis, "shift-jis");
9231   DEFSYM (Qbig5, "big5");
9232
9233   DEFSYM (Qcoding_system_p, "coding-system-p");
9234
9235   DEFSYM (Qcoding_system_error, "coding-system-error");
9236   Fput (Qcoding_system_error, Qerror_conditions,
9237         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9238   Fput (Qcoding_system_error, Qerror_message,
9239         build_string ("Invalid coding system"));
9240
9241   /* Intern this now in case it isn't already done.
9242      Setting this variable twice is harmless.
9243      But don't staticpro it here--that is done in alloc.c.  */
9244   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9245
9246   DEFSYM (Qtranslation_table, "translation-table");
9247   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9248   DEFSYM (Qtranslation_table_id, "translation-table-id");
9249   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9250   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9251
9252   DEFSYM (Qvalid_codes, "valid-codes");
9253
9254   DEFSYM (Qemacs_mule, "emacs-mule");
9255
9256   DEFSYM (QCcategory, ":category");
9257   DEFSYM (QCmnemonic, ":mnemonic");
9258   DEFSYM (QCdefalut_char, ":default-char");
9259   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9260   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9261   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9262   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9263   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9264
9265   Vcoding_category_table
9266     = Fmake_vector (make_number (coding_category_max), Qnil);
9267   staticpro (&Vcoding_category_table);
9268   /* Followings are target of code detection.  */
9269   ASET (Vcoding_category_table, coding_category_iso_7,
9270         intern ("coding-category-iso-7"));
9271   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9272         intern ("coding-category-iso-7-tight"));
9273   ASET (Vcoding_category_table, coding_category_iso_8_1,
9274         intern ("coding-category-iso-8-1"));
9275   ASET (Vcoding_category_table, coding_category_iso_8_2,
9276         intern ("coding-category-iso-8-2"));
9277   ASET (Vcoding_category_table, coding_category_iso_7_else,
9278         intern ("coding-category-iso-7-else"));
9279   ASET (Vcoding_category_table, coding_category_iso_8_else,
9280         intern ("coding-category-iso-8-else"));
9281   ASET (Vcoding_category_table, coding_category_utf_8,
9282         intern ("coding-category-utf-8"));
9283   ASET (Vcoding_category_table, coding_category_utf_16_be,
9284         intern ("coding-category-utf-16-be"));
9285   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9286         intern ("coding-category-utf-16-auto"));
9287   ASET (Vcoding_category_table, coding_category_utf_16_le,
9288         intern ("coding-category-utf-16-le"));
9289   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9290         intern ("coding-category-utf-16-be-nosig"));
9291   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9292         intern ("coding-category-utf-16-le-nosig"));
9293   ASET (Vcoding_category_table, coding_category_charset,
9294         intern ("coding-category-charset"));
9295   ASET (Vcoding_category_table, coding_category_sjis,
9296         intern ("coding-category-sjis"));
9297   ASET (Vcoding_category_table, coding_category_big5,
9298         intern ("coding-category-big5"));
9299   ASET (Vcoding_category_table, coding_category_ccl,
9300         intern ("coding-category-ccl"));
9301   ASET (Vcoding_category_table, coding_category_emacs_mule,
9302         intern ("coding-category-emacs-mule"));
9303   /* Followings are NOT target of code detection.  */
9304   ASET (Vcoding_category_table, coding_category_raw_text,
9305         intern ("coding-category-raw-text"));
9306   ASET (Vcoding_category_table, coding_category_undecided,
9307         intern ("coding-category-undecided"));
9308
9309   DEFSYM (Qinsufficient_source, "insufficient-source");
9310   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9311   DEFSYM (Qinvalid_source, "invalid-source");
9312   DEFSYM (Qinterrupted, "interrupted");
9313   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9314
9315   defsubr (&Scoding_system_p);
9316   defsubr (&Sread_coding_system);
9317   defsubr (&Sread_non_nil_coding_system);
9318   defsubr (&Scheck_coding_system);
9319   defsubr (&Sdetect_coding_region);
9320   defsubr (&Sdetect_coding_string);
9321   defsubr (&Sfind_coding_systems_region_internal);
9322   defsubr (&Sunencodable_char_position);
9323   defsubr (&Scheck_coding_systems_region);
9324   defsubr (&Sdecode_coding_region);
9325   defsubr (&Sencode_coding_region);
9326   defsubr (&Sdecode_coding_string);
9327   defsubr (&Sencode_coding_string);
9328   defsubr (&Sdecode_sjis_char);
9329   defsubr (&Sencode_sjis_char);
9330   defsubr (&Sdecode_big5_char);
9331   defsubr (&Sencode_big5_char);
9332   defsubr (&Sset_terminal_coding_system_internal);
9333   defsubr (&Sset_safe_terminal_coding_system_internal);
9334   defsubr (&Sterminal_coding_system);
9335   defsubr (&Sset_keyboard_coding_system_internal);
9336   defsubr (&Skeyboard_coding_system);
9337   defsubr (&Sfind_operation_coding_system);
9338   defsubr (&Sset_coding_system_priority);
9339   defsubr (&Sdefine_coding_system_internal);
9340   defsubr (&Sdefine_coding_system_alias);
9341   defsubr (&Scoding_system_put);
9342   defsubr (&Scoding_system_base);
9343   defsubr (&Scoding_system_plist);
9344   defsubr (&Scoding_system_aliases);
9345   defsubr (&Scoding_system_eol_type);
9346   defsubr (&Scoding_system_priority_list);
9347
9348   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9349                doc: /* List of coding systems.
9350
9351 Do not alter the value of this variable manually.  This variable should be
9352 updated by the functions `define-coding-system' and
9353 `define-coding-system-alias'.  */);
9354   Vcoding_system_list = Qnil;
9355
9356   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9357                doc: /* Alist of coding system names.
9358 Each element is one element list of coding system name.
9359 This variable is given to `completing-read' as TABLE argument.
9360
9361 Do not alter the value of this variable manually.  This variable should be
9362 updated by the functions `make-coding-system' and
9363 `define-coding-system-alias'.  */);
9364   Vcoding_system_alist = Qnil;
9365
9366   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9367                doc: /* List of coding-categories (symbols) ordered by priority.
9368
9369 On detecting a coding system, Emacs tries code detection algorithms
9370 associated with each coding-category one by one in this order.  When
9371 one algorithm agrees with a byte sequence of source text, the coding
9372 system bound to the corresponding coding-category is selected.
9373
9374 Don't modify this variable directly, but use `set-coding-priority'.  */);
9375   {
9376     int i;
9377
9378     Vcoding_category_list = Qnil;
9379     for (i = coding_category_max - 1; i >= 0; i--)
9380       Vcoding_category_list
9381         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9382                  Vcoding_category_list);
9383   }
9384
9385   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9386                doc: /* Specify the coding system for read operations.
9387 It is useful to bind this variable with `let', but do not set it globally.
9388 If the value is a coding system, it is used for decoding on read operation.
9389 If not, an appropriate element is used from one of the coding system alists:
9390 There are three such tables, `file-coding-system-alist',
9391 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9392   Vcoding_system_for_read = Qnil;
9393
9394   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9395                doc: /* Specify the coding system for write operations.
9396 Programs bind this variable with `let', but you should not set it globally.
9397 If the value is a coding system, it is used for encoding of output,
9398 when writing it to a file and when sending it to a file or subprocess.
9399
9400 If this does not specify a coding system, an appropriate element
9401 is used from one of the coding system alists:
9402 There are three such tables, `file-coding-system-alist',
9403 `process-coding-system-alist', and `network-coding-system-alist'.
9404 For output to files, if the above procedure does not specify a coding system,
9405 the value of `buffer-file-coding-system' is used.  */);
9406   Vcoding_system_for_write = Qnil;
9407
9408   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9409                doc: /*
9410 Coding system used in the latest file or process I/O.  */);
9411   Vlast_coding_system_used = Qnil;
9412
9413   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9414                doc: /*
9415 Error status of the last code conversion.
9416
9417 When an error was detected in the last code conversion, this variable
9418 is set to one of the following symbols.
9419   `insufficient-source'
9420   `inconsistent-eol'
9421   `invalid-source'
9422   `interrupted'
9423   `insufficient-memory'
9424 When no error was detected, the value doesn't change.  So, to check
9425 the error status of a code conversion by this variable, you must
9426 explicitly set this variable to nil before performing code
9427 conversion.  */);
9428   Vlast_code_conversion_error = Qnil;
9429
9430   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9431                doc: /*
9432 *Non-nil means always inhibit code conversion of end-of-line format.
9433 See info node `Coding Systems' and info node `Text and Binary' concerning
9434 such conversion.  */);
9435   inhibit_eol_conversion = 0;
9436
9437   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9438                doc: /*
9439 Non-nil means process buffer inherits coding system of process output.
9440 Bind it to t if the process output is to be treated as if it were a file
9441 read from some filesystem.  */);
9442   inherit_process_coding_system = 0;
9443
9444   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9445                doc: /*
9446 Alist to decide a coding system to use for a file I/O operation.
9447 The format is ((PATTERN . VAL) ...),
9448 where PATTERN is a regular expression matching a file name,
9449 VAL is a coding system, a cons of coding systems, or a function symbol.
9450 If VAL is a coding system, it is used for both decoding and encoding
9451 the file contents.
9452 If VAL is a cons of coding systems, the car part is used for decoding,
9453 and the cdr part is used for encoding.
9454 If VAL is a function symbol, the function must return a coding system
9455 or a cons of coding systems which are used as above.  The function gets
9456 the arguments with which `find-operation-coding-systems' was called.
9457
9458 See also the function `find-operation-coding-system'
9459 and the variable `auto-coding-alist'.  */);
9460   Vfile_coding_system_alist = Qnil;
9461
9462   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9463                doc: /*
9464 Alist to decide a coding system to use for a process I/O operation.
9465 The format is ((PATTERN . VAL) ...),
9466 where PATTERN is a regular expression matching a program name,
9467 VAL is a coding system, a cons of coding systems, or a function symbol.
9468 If VAL is a coding system, it is used for both decoding what received
9469 from the program and encoding what sent to the program.
9470 If VAL is a cons of coding systems, the car part is used for decoding,
9471 and the cdr part is used for encoding.
9472 If VAL is a function symbol, the function must return a coding system
9473 or a cons of coding systems which are used as above.
9474
9475 See also the function `find-operation-coding-system'.  */);
9476   Vprocess_coding_system_alist = Qnil;
9477
9478   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9479                doc: /*
9480 Alist to decide a coding system to use for a network I/O operation.
9481 The format is ((PATTERN . VAL) ...),
9482 where PATTERN is a regular expression matching a network service name
9483 or is a port number to connect to,
9484 VAL is a coding system, a cons of coding systems, or a function symbol.
9485 If VAL is a coding system, it is used for both decoding what received
9486 from the network stream and encoding what sent to the network stream.
9487 If VAL is a cons of coding systems, the car part is used for decoding,
9488 and the cdr part is used for encoding.
9489 If VAL is a function symbol, the function must return a coding system
9490 or a cons of coding systems which are used as above.
9491
9492 See also the function `find-operation-coding-system'.  */);
9493   Vnetwork_coding_system_alist = Qnil;
9494
9495   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9496                doc: /* Coding system to use with system messages.
9497 Also used for decoding keyboard input on X Window system.  */);
9498   Vlocale_coding_system = Qnil;
9499
9500   /* The eol mnemonics are reset in startup.el system-dependently.  */
9501   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9502                doc: /*
9503 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9504   eol_mnemonic_unix = build_string (":");
9505
9506   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9507                doc: /*
9508 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9509   eol_mnemonic_dos = build_string ("\\");
9510
9511   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9512                doc: /*
9513 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9514   eol_mnemonic_mac = build_string ("/");
9515
9516   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9517                doc: /*
9518 *String displayed in mode line when end-of-line format is not yet determined.  */);
9519   eol_mnemonic_undecided = build_string (":");
9520
9521   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9522                doc: /*
9523 *Non-nil enables character translation while encoding and decoding.  */);
9524   Venable_character_translation = Qt;
9525
9526   DEFVAR_LISP ("standard-translation-table-for-decode",
9527                &Vstandard_translation_table_for_decode,
9528                doc: /* Table for translating characters while decoding.  */);
9529   Vstandard_translation_table_for_decode = Qnil;
9530
9531   DEFVAR_LISP ("standard-translation-table-for-encode",
9532                &Vstandard_translation_table_for_encode,
9533                doc: /* Table for translating characters while encoding.  */);
9534   Vstandard_translation_table_for_encode = Qnil;
9535
9536   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9537                doc: /* Alist of charsets vs revision numbers.
9538 While encoding, if a charset (car part of an element) is found,
9539 designate it with the escape sequence identifying revision (cdr part
9540 of the element).  */);
9541   Vcharset_revision_table = Qnil;
9542
9543   DEFVAR_LISP ("default-process-coding-system",
9544                &Vdefault_process_coding_system,
9545                doc: /* Cons of coding systems used for process I/O by default.
9546 The car part is used for decoding a process output,
9547 the cdr part is used for encoding a text to be sent to a process.  */);
9548   Vdefault_process_coding_system = Qnil;
9549
9550   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9551                doc: /*
9552 Table of extra Latin codes in the range 128..159 (inclusive).
9553 This is a vector of length 256.
9554 If Nth element is non-nil, the existence of code N in a file
9555 \(or output of subprocess) doesn't prevent it to be detected as
9556 a coding system of ISO 2022 variant which has a flag
9557 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9558 or reading output of a subprocess.
9559 Only 128th through 159th elements has a meaning.  */);
9560   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9561
9562   DEFVAR_LISP ("select-safe-coding-system-function",
9563                &Vselect_safe_coding_system_function,
9564                doc: /*
9565 Function to call to select safe coding system for encoding a text.
9566
9567 If set, this function is called to force a user to select a proper
9568 coding system which can encode the text in the case that a default
9569 coding system used in each operation can't encode the text.
9570
9571 The default value is `select-safe-coding-system' (which see).  */);
9572   Vselect_safe_coding_system_function = Qnil;
9573
9574   DEFVAR_BOOL ("coding-system-require-warning",
9575                &coding_system_require_warning,
9576                doc: /* Internal use only.
9577 If non-nil, on writing a file, `select-safe-coding-system-function' is
9578 called even if `coding-system-for-write' is non-nil.  The command
9579 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9580   coding_system_require_warning = 0;
9581
9582
9583   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9584                &inhibit_iso_escape_detection,
9585                doc: /*
9586 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9587
9588 By default, on reading a file, Emacs tries to detect how the text is
9589 encoded.  This code detection is sensitive to escape sequences.  If
9590 the sequence is valid as ISO2022, the code is determined as one of
9591 the ISO2022 encodings, and the file is decoded by the corresponding
9592 coding system (e.g. `iso-2022-7bit').
9593
9594 However, there may be a case that you want to read escape sequences in
9595 a file as is.  In such a case, you can set this variable to non-nil.
9596 Then, as the code detection ignores any escape sequences, no file is
9597 detected as encoded in some ISO2022 encoding.  The result is that all
9598 escape sequences become visible in a buffer.
9599
9600 The default value is nil, and it is strongly recommended not to change
9601 it.  That is because many Emacs Lisp source files that contain
9602 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9603 in Emacs's distribution, and they won't be decoded correctly on
9604 reading if you suppress escape sequence detection.
9605
9606 The other way to read escape sequences in a file without decoding is
9607 to explicitly specify some coding system that doesn't use ISO2022's
9608 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9609   inhibit_iso_escape_detection = 0;
9610
9611   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9612                doc: /* Char table for translating self-inserting characters.
9613 This is applied to the result of input methods, not their input.  See also
9614 `keyboard-translate-table'.  */);
9615     Vtranslation_table_for_input = Qnil;
9616
9617   {
9618     Lisp_Object args[coding_arg_max];
9619     Lisp_Object plist[16];
9620     int i;
9621
9622     for (i = 0; i < coding_arg_max; i++)
9623       args[i] = Qnil;
9624
9625     plist[0] = intern (":name");
9626     plist[1] = args[coding_arg_name] = Qno_conversion;
9627     plist[2] = intern (":mnemonic");
9628     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9629     plist[4] = intern (":coding-type");
9630     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9631     plist[6] = intern (":ascii-compatible-p");
9632     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9633     plist[8] = intern (":default-char");
9634     plist[9] = args[coding_arg_default_char] = make_number (0);
9635     plist[10] = intern (":for-unibyte");
9636     plist[11] = args[coding_arg_for_unibyte] = Qt;
9637     plist[12] = intern (":docstring");
9638     plist[13] = build_string ("Do no conversion.\n\
9639 \n\
9640 When you visit a file with this coding, the file is read into a\n\
9641 unibyte buffer as is, thus each byte of a file is treated as a\n\
9642 character.");
9643     plist[14] = intern (":eol-type");
9644     plist[15] = args[coding_arg_eol_type] = Qunix;
9645     args[coding_arg_plist] = Flist (16, plist);
9646     Fdefine_coding_system_internal (coding_arg_max, args);
9647
9648     plist[1] = args[coding_arg_name] = Qundecided;
9649     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9650     plist[5] = args[coding_arg_coding_type] = Qundecided;
9651     /* This is already set.
9652        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9653     plist[8] = intern (":charset-list");
9654     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9655     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9656     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9657     plist[15] = args[coding_arg_eol_type] = Qnil;
9658     args[coding_arg_plist] = Flist (16, plist);
9659     Fdefine_coding_system_internal (coding_arg_max, args);
9660   }
9661
9662   setup_coding_system (Qno_conversion, &keyboard_coding);
9663   setup_coding_system (Qundecided, &terminal_coding);
9664   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9665
9666   {
9667     int i;
9668
9669     for (i = 0; i < coding_category_max; i++)
9670       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9671   }
9672 }
9673
9674 char *
9675 emacs_strerror (error_number)
9676      int error_number;
9677 {
9678   char *str;
9679
9680   synchronize_system_messages_locale ();
9681   str = strerror (error_number);
9682
9683   if (! NILP (Vlocale_coding_system))
9684     {
9685       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9686                                                       Vlocale_coding_system,
9687                                                       0);
9688       str = (char *) SDATA (dec);
9689     }
9690
9691   return str;
9692 }
9693
9694 #endif /* emacs */
9695
9696 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9697    (do not change this comment) */