1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
26 2. Emacs' internal format (emacs-mule) handlers
28 4. Shift-JIS and BIG5 handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
37 /*** 0. General comments ***/
40 /*** GENERAL NOTE on CODING SYSTEM ***
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
49 0. Emacs' internal format (emacs-mule)
51 Emacs itself holds a multi-lingual character in a buffer and a string
52 in a special format. Details are described in section 2.
56 The most famous coding system for multiple character sets. X's
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
82 If a user wants to read/write a text encoded in a coding system not
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
89 information about it is set in a structure of type `struct
90 coding_system' for rapid processing. See section 6 for more details.
94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
98 whereas DOS's format is two-byte sequence of `carriage-return' and
99 `line-feed' codes. MacOS's format is usually one byte of
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
104 any format of end-of-line. So, Emacs has information of format of
105 end-of-line in each coding-system. See section 6 for more details.
109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX are set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
118 detect_coding_emacs_mule (src
, src_end
)
119 unsigned char *src
, *src_end
;
125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches the head of the not-yet-decoded source text.
141 Below is a template of these functions. */
144 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
145 struct coding_system
*coding
;
146 unsigned char *source
, *destination
;
147 int src_bytes
, dst_bytes
;
153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
165 DST_BYTES zero means that source area and destination area are
166 overlapped, which means that we can produce an encoded text until it
167 reaches the head of the not-yet-encoded source text.
169 Below is a template of these functions. */
172 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
173 struct coding_system
*coding
;
174 unsigned char *source
, *destination
;
175 int src_bytes
, dst_bytes
;
181 /*** COMMONLY USED MACROS ***/
183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
191 #define ONE_MORE_BYTE(c1) \
193 if (src >= src_end) \
195 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
196 goto label_end_of_loop; \
201 #define TWO_MORE_BYTES(c1, c2) \
203 if (src + 1 >= src_end) \
205 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
206 goto label_end_of_loop; \
213 /* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding'
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
223 #define ONE_MORE_CHAR(c) \
225 int len = src_end - src; \
229 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
230 goto label_end_of_loop; \
232 if (coding->src_multibyte \
233 || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
234 c = STRING_CHAR_AND_LENGTH (src, len, bytes); \
236 c = *src, bytes = 1; \
237 if (!NILP (translation_table)) \
238 c = translate_char (translation_table, c, 0, 0, 0); \
243 /* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst'.
246 If we are now in the middle of composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
251 This macro is used in decoding routines. */
253 #define EMIT_CHAR(c) \
255 if (! COMPOSING_P (coding) \
256 || coding->composing == COMPOSITION_RELATIVE \
257 || coding->composing == COMPOSITION_WITH_RULE) \
259 int bytes = CHAR_BYTES (c); \
260 if ((dst + bytes) > (dst_bytes ? dst_end : src)) \
262 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
263 goto label_end_of_loop; \
265 dst += CHAR_STRING (c, dst); \
266 coding->produced_char++; \
269 if (COMPOSING_P (coding) \
270 && coding->composing != COMPOSITION_RELATIVE) \
272 CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
273 coding->composition_rule_follows \
274 = coding->composing != COMPOSITION_WITH_ALTCHARS; \
279 #define EMIT_ONE_BYTE(c) \
281 if (dst >= (dst_bytes ? dst_end : src)) \
283 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
284 goto label_end_of_loop; \
289 #define EMIT_TWO_BYTES(c1, c2) \
291 if (dst + 2 > (dst_bytes ? dst_end : src)) \
293 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
294 goto label_end_of_loop; \
296 *dst++ = c1, *dst++ = c2; \
299 #define EMIT_BYTES(from, to) \
301 if (dst + (to - from) > (dst_bytes ? dst_end : src)) \
303 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
304 goto label_end_of_loop; \
311 /*** 1. Preamble ***/
324 #include "composite.h"
329 #else /* not emacs */
333 #endif /* not emacs */
335 Lisp_Object Qcoding_system
, Qeol_type
;
336 Lisp_Object Qbuffer_file_coding_system
;
337 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
338 Lisp_Object Qno_conversion
, Qundecided
;
339 Lisp_Object Qcoding_system_history
;
340 Lisp_Object Qsafe_charsets
;
341 Lisp_Object Qvalid_codes
;
343 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
344 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
345 Lisp_Object Qstart_process
, Qopen_network_stream
;
346 Lisp_Object Qtarget_idx
;
348 Lisp_Object Vselect_safe_coding_system_function
;
350 /* Mnemonic string for each format of end-of-line. */
351 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
352 /* Mnemonic string to indicate format of end-of-line is not yet
354 Lisp_Object eol_mnemonic_undecided
;
356 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
362 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
364 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
366 /* Coding system emacs-mule and raw-text are for converting only
367 end-of-line format. */
368 Lisp_Object Qemacs_mule
, Qraw_text
;
370 /* Coding-systems are handed between Emacs Lisp programs and C internal
371 routines by the following three variables. */
372 /* Coding-system for reading files and receiving data from process. */
373 Lisp_Object Vcoding_system_for_read
;
374 /* Coding-system for writing files and sending data to process. */
375 Lisp_Object Vcoding_system_for_write
;
376 /* Coding-system actually used in the latest I/O. */
377 Lisp_Object Vlast_coding_system_used
;
379 /* A vector of length 256 which contains information about special
380 Latin codes (especially for dealing with Microsoft codes). */
381 Lisp_Object Vlatin_extra_code_table
;
383 /* Flag to inhibit code conversion of end-of-line format. */
384 int inhibit_eol_conversion
;
386 /* Flag to inhibit ISO2022 escape sequence detection. */
387 int inhibit_iso_escape_detection
;
389 /* Flag to make buffer-file-coding-system inherit from process-coding. */
390 int inherit_process_coding_system
;
392 /* Coding system to be used to encode text for terminal display. */
393 struct coding_system terminal_coding
;
395 /* Coding system to be used to encode text for terminal display when
396 terminal coding system is nil. */
397 struct coding_system safe_terminal_coding
;
399 /* Coding system of what is sent from terminal keyboard. */
400 struct coding_system keyboard_coding
;
402 /* Default coding system to be used to write a file. */
403 struct coding_system default_buffer_file_coding
;
405 Lisp_Object Vfile_coding_system_alist
;
406 Lisp_Object Vprocess_coding_system_alist
;
407 Lisp_Object Vnetwork_coding_system_alist
;
409 Lisp_Object Vlocale_coding_system
;
413 Lisp_Object Qcoding_category
, Qcoding_category_index
;
415 /* List of symbols `coding-category-xxx' ordered by priority. */
416 Lisp_Object Vcoding_category_list
;
418 /* Table of coding categories (Lisp symbols). */
419 Lisp_Object Vcoding_category_table
;
421 /* Table of names of symbol for each coding-category. */
422 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
423 "coding-category-emacs-mule",
424 "coding-category-sjis",
425 "coding-category-iso-7",
426 "coding-category-iso-7-tight",
427 "coding-category-iso-8-1",
428 "coding-category-iso-8-2",
429 "coding-category-iso-7-else",
430 "coding-category-iso-8-else",
431 "coding-category-ccl",
432 "coding-category-big5",
433 "coding-category-utf-8",
434 "coding-category-utf-16-be",
435 "coding-category-utf-16-le",
436 "coding-category-raw-text",
437 "coding-category-binary"
440 /* Table of pointers to coding systems corresponding to each coding
442 struct coding_system
*coding_system_table
[CODING_CATEGORY_IDX_MAX
];
444 /* Table of coding category masks. Nth element is a mask for a coding
445 category whose priority is Nth. */
447 int coding_priorities
[CODING_CATEGORY_IDX_MAX
];
449 /* Flag to tell if we look up translation table on character code
451 Lisp_Object Venable_character_translation
;
452 /* Standard translation table to look up on decoding (reading). */
453 Lisp_Object Vstandard_translation_table_for_decode
;
454 /* Standard translation table to look up on encoding (writing). */
455 Lisp_Object Vstandard_translation_table_for_encode
;
457 Lisp_Object Qtranslation_table
;
458 Lisp_Object Qtranslation_table_id
;
459 Lisp_Object Qtranslation_table_for_decode
;
460 Lisp_Object Qtranslation_table_for_encode
;
462 /* Alist of charsets vs revision number. */
463 Lisp_Object Vcharset_revision_alist
;
465 /* Default coding systems used for process I/O. */
466 Lisp_Object Vdefault_process_coding_system
;
468 /* Global flag to tell that we can't call post-read-conversion and
469 pre-write-conversion functions. Usually the value is zero, but it
470 is set to 1 temporarily while such functions are running. This is
471 to avoid infinite recursive call. */
472 static int inhibit_pre_post_conversion
;
475 /*** 2. Emacs internal format (emacs-mule) handlers ***/
477 /* Emacs' internal format for encoding multiple character sets is a
478 kind of multi-byte encoding, i.e. characters are encoded by
479 variable-length sequences of one-byte codes.
481 ASCII characters and control characters (e.g. `tab', `newline') are
482 represented by one-byte sequences which are their ASCII codes, in
483 the range 0x00 through 0x7F.
485 8-bit characters of the range 0x80..0x9F are represented by
486 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
489 8-bit characters of the range 0xA0..0xFF are represented by
490 one-byte sequences which are their 8-bit code.
492 The other characters are represented by a sequence of `base
493 leading-code', optional `extended leading-code', and one or two
494 `position-code's. The length of the sequence is determined by the
495 base leading-code. Leading-code takes the range 0x80 through 0x9F,
496 whereas extended leading-code and position-code take the range 0xA0
497 through 0xFF. See `charset.h' for more details about leading-code
500 --- CODE RANGE of Emacs' internal format ---
504 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
505 eight-bit-graphic 0xA0..0xFF
506 ELSE 0x81..0x9F + [0xA0..0xFF]+
507 ---------------------------------------------
511 enum emacs_code_class_type emacs_code_class
[256];
513 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
514 Check if a text is encoded in Emacs' internal format. If it is,
515 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
518 detect_coding_emacs_mule (src
, src_end
)
519 unsigned char *src
, *src_end
;
523 /* Dummy for ONE_MORE_BYTE. */
524 struct coding_system dummy_coding
;
525 struct coding_system
*coding
= &dummy_coding
;
546 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
549 else if (c
>= 0x80 && c
< 0xA0)
552 /* Old leading code for a composite character. */
556 unsigned char *src_base
= src
- 1;
559 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base
, src_end
- src_base
,
562 src
= src_base
+ bytes
;
567 return CODING_CATEGORY_MASK_EMACS_MULE
;
571 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
574 decode_coding_emacs_mule (coding
, source
, destination
, src_bytes
, dst_bytes
)
575 struct coding_system
*coding
;
576 unsigned char *source
, *destination
;
577 int src_bytes
, dst_bytes
;
579 unsigned char *src
= source
;
580 unsigned char *src_end
= source
+ src_bytes
;
581 unsigned char *dst
= destination
;
582 unsigned char *dst_end
= destination
+ dst_bytes
;
583 /* SRC_BASE remembers the start position in source in each loop.
584 The loop will be exited when there's not enough source code, or
585 when there's not enough destination area to produce a
587 unsigned char *src_base
;
589 coding
->produced_char
= 0;
590 while ((src_base
= src
) < src_end
)
592 unsigned char tmp
[MAX_MULTIBYTE_LENGTH
], *p
;
595 if (UNIBYTE_STR_AS_MULTIBYTE_P (src
, src_end
- src
, bytes
))
602 bytes
= CHAR_STRING (*src
, tmp
);
606 if (dst
+ bytes
>= (dst_bytes
? dst_end
: src
))
608 coding
->result
= CODING_FINISH_INSUFFICIENT_DST
;
611 while (bytes
--) *dst
++ = *p
++;
612 coding
->produced_char
++;
614 coding
->consumed
= coding
->consumed_char
= src_base
- source
;
615 coding
->produced
= dst
- destination
;
/* Encoding to Emacs' internal format (emacs-mule) has no dedicated
   routine: this macro forwards all five arguments, unchanged and in
   the same order, to encode_eol.  This matches the note elsewhere in
   this file that the emacs-mule coding system converts only the
   end-of-line format.  */
618 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
619 encode_eol (coding, source, destination, src_bytes, dst_bytes)
623 /*** 3. ISO2022 handlers ***/
625 /* The following note describes the coding system ISO2022 briefly.
626 Since the intention of this note is to help understand the
627 functions in this file, some parts are NOT ACCURATE or OVERLY
628 SIMPLIFIED. For thorough understanding, please refer to the
629 original document of ISO2022.
631 ISO2022 provides many mechanisms to encode several character sets
632 in 7-bit and 8-bit environments. For 7-bit environments, all text
633 is encoded using bytes less than 128. This may make the encoded
634 text a little bit longer, but the text passes more easily through
635 several gateways, some of which strip off MSB (Most Significant Bit).
637 There are two kinds of character sets: control character set and
638 graphic character set. The former contains control characters such
639 as `newline' and `escape' to provide control functions (control
640 functions are also provided by escape sequences). The latter
641 contains graphic characters such as 'A' and '-'. Emacs recognizes
642 two control character sets and many graphic character sets.
644 Graphic character sets are classified into one of the following
645 four classes, according to the number of bytes (DIMENSION) and
646 number of characters in one dimension (CHARS) of the set:
652 In addition, each character set is assigned an identification tag,
653 unique for each set, called "final character" (denoted as <F>
654 hereafter). The <F> of each character set is decided by ECMA(*)
655 when it is registered in ISO. The code range of <F> is 0x30..0x7F
656 (0x30..0x3F are for private use only).
658 Note (*): ECMA = European Computer Manufacturers Association
660 Here are examples of graphic character set [NAME(<F>)]:
661 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
662 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
663 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
664 o DIMENSION2_CHARS96 -- none for the moment
666 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
667 C0 [0x00..0x1F] -- control character plane 0
668 GL [0x20..0x7F] -- graphic character plane 0
669 C1 [0x80..0x9F] -- control character plane 1
670 GR [0xA0..0xFF] -- graphic character plane 1
672 A control character set is directly designated and invoked to C0 or
673 C1 by an escape sequence. The most common case is that:
674 - ISO646's control character set is designated/invoked to C0, and
675 - ISO6429's control character set is designated/invoked to C1,
676 and usually these designations/invocations are omitted in encoded
677 text. In a 7-bit environment, only C0 can be used, and a control
678 character for C1 is encoded by an appropriate escape sequence to
679 fit into the environment. All control characters for C1 are
680 defined to have corresponding escape sequences.
682 A graphic character set is at first designated to one of four
683 graphic registers (G0 through G3), then these graphic registers are
684 invoked to GL or GR. These designations and invocations can be
685 done independently. The most common case is that G0 is invoked to
686 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
687 these invocations and designations are omitted in encoded text.
688 In a 7-bit environment, only GL can be used.
690 When a graphic character set of CHARS94 is invoked to GL, codes
691 0x20 and 0x7F of the GL area work as control characters SPACE and
692 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
695 There are two ways of invocation: locking-shift and single-shift.
696 With locking-shift, the invocation lasts until the next different
697 invocation, whereas with single-shift, the invocation affects the
698 following character only and doesn't affect the locking-shift
699 state. Invocations are done by the following control characters or
702 ----------------------------------------------------------------------
703 abbrev function cntrl escape seq description
704 ----------------------------------------------------------------------
705 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
706 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
707 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
708 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
709 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
710 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
711 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
712 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
713 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
714 ----------------------------------------------------------------------
715 (*) These are not used by any known coding system.
717 Control characters for these functions are defined by macros
718 ISO_CODE_XXX in `coding.h'.
720 Designations are done by the following escape sequences:
721 ----------------------------------------------------------------------
722 escape sequence description
723 ----------------------------------------------------------------------
724 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
725 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
726 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
727 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
728 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
729 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
730 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
731 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
732 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
733 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
734 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
735 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
736 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
737 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
738 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
739 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
740 ----------------------------------------------------------------------
742 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
743 of dimension 1, chars 94, and final character <F>, etc...
745 Note (*): Although these designations are not allowed in ISO2022,
746 Emacs accepts them on decoding, and produces them on encoding
747 CHARS96 character sets in a coding system which is characterized as
748 7-bit environment, non-locking-shift, and non-single-shift.
750 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
751 '(' can be omitted. We refer to this as "short-form" hereafter.
753 Now you may notice that there are a lot of ways for encoding the
754 same multilingual text in ISO2022. Actually, there exist many
755 coding systems such as Compound Text (used in X11's inter-client
756 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
757 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
758 localized platforms), and all of these are variants of ISO2022.
760 In addition to the above, Emacs handles two more kinds of escape
761 sequences: ISO6429's direction specification and Emacs' private
762 sequence for specifying character composition.
764 ISO6429's direction specification takes the following form:
765 o CSI ']' -- end of the current direction
766 o CSI '0' ']' -- end of the current direction
767 o CSI '1' ']' -- start of left-to-right text
768 o CSI '2' ']' -- start of right-to-left text
769 The control character CSI (0x9B: control sequence introducer) is
770 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
772 Character composition specification takes the following form:
773 o ESC '0' -- start relative composition
774 o ESC '1' -- end composition
775 o ESC '2' -- start rule-base composition (*)
776 o ESC '3' -- start relative composition with alternate chars (**)
777 o ESC '4' -- start rule-base composition with alternate chars (**)
778 Since these are not standard escape sequences of any ISO standard,
779 the use of them with these meanings is restricted to Emacs only.
781 (*) This form is used only in Emacs 20.5 and the older versions,
782 but the newer versions can safely decode it.
783 (**) This form is used only in Emacs 21.1 and the newer versions,
784 and the older versions can't decode it.
786 Here's a list of example usages of these composition escape
787 sequences (categorized by `enum composition_method').
789 COMPOSITION_RELATIVE:
790 ESC 0 CHAR [ CHAR ] ESC 1
791 COMPOSITION_WITH_RULE:
792 ESC 2 CHAR [ RULE CHAR ] ESC 1
793 COMPOSITION_WITH_ALTCHARS:
794 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
795 COMPOSITION_WITH_RULE_ALTCHARS:
796 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
798 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if the coding system stored at index IDX of
   coding_system_table can handle CHARSET.  The entry must be non-null,
   and either its safe_charsets[CHARSET] element is nonzero or
   CODING_SPEC_ISO_REQUESTED_DESIGNATION for CHARSET differs from
   CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  IDX is evaluated up to
   three times and CHARSET up to twice, so pass expressions without
   side effects.  */
800 #define CHARSET_OK(idx, charset) \
801 (coding_system_table[idx] \
802 && (coding_system_table[idx]->safe_charsets[charset] \
803 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
804 (coding_system_table[idx], charset) \
805 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
/* Nonzero if CODING_SPEC_ISO_INITIAL_DESIGNATION for register 1
   (presumably graphic register G1, the one shift-out invokes) of the
   coding system at index IDX of coding_system_table is non-negative.
   NOTE(review): unlike CHARSET_OK, this macro does not test
   coding_system_table[IDX] for null before using it -- confirm that
   every caller guarantees the entry exists.  */
807 #define SHIFT_OUT_OK(idx) \
808 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
810 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
811 Check if a text is encoded in ISO2022. If it is, return an
812 integer in which any of the following flag bits:
813 CODING_CATEGORY_MASK_ISO_7
814 CODING_CATEGORY_MASK_ISO_7_TIGHT
815 CODING_CATEGORY_MASK_ISO_8_1
816 CODING_CATEGORY_MASK_ISO_8_2
817 CODING_CATEGORY_MASK_ISO_7_ELSE
818 CODING_CATEGORY_MASK_ISO_8_ELSE
819 are set. If a code which should never appear in ISO2022 is found,
823 detect_coding_iso2022 (src
, src_end
)
824 unsigned char *src
, *src_end
;
826 int mask
= CODING_CATEGORY_MASK_ISO
;
828 int reg
[4], shift_out
= 0, single_shifting
= 0;
829 int c
, c1
, i
, charset
;
830 /* Dummy for ONE_MORE_BYTE. */
831 struct coding_system dummy_coding
;
832 struct coding_system
*coding
= &dummy_coding
;
834 reg
[0] = CHARSET_ASCII
, reg
[1] = reg
[2] = reg
[3] = -1;
835 while (mask
&& src
< src_end
)
841 if (inhibit_iso_escape_detection
)
845 if (c
>= '(' && c
<= '/')
847 /* Designation sequence for a charset of dimension 1. */
849 if (c1
< ' ' || c1
>= 0x80
850 || (charset
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
851 /* Invalid designation sequence. Just ignore. */
853 reg
[(c
- '(') % 4] = charset
;
857 /* Designation sequence for a charset of dimension 2. */
859 if (c
>= '@' && c
<= 'B')
860 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
861 reg
[0] = charset
= iso_charset_table
[1][0][c
];
862 else if (c
>= '(' && c
<= '/')
865 if (c1
< ' ' || c1
>= 0x80
866 || (charset
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
867 /* Invalid designation sequence. Just ignore. */
869 reg
[(c
- '(') % 4] = charset
;
872 /* Invalid designation sequence. Just ignore. */
875 else if (c
== 'N' || c
== 'O')
877 /* ESC <Fe> for SS2 or SS3. */
878 mask
&= CODING_CATEGORY_MASK_ISO_7_ELSE
;
881 else if (c
>= '0' && c
<= '4')
883 /* ESC <Fp> for start/end composition. */
884 mask_found
|= CODING_CATEGORY_MASK_ISO
;
888 /* Invalid escape sequence. Just ignore. */
891 /* We found a valid designation sequence for CHARSET. */
892 mask
&= ~CODING_CATEGORY_MASK_ISO_8BIT
;
893 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7
, charset
))
894 mask_found
|= CODING_CATEGORY_MASK_ISO_7
;
896 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
897 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT
, charset
))
898 mask_found
|= CODING_CATEGORY_MASK_ISO_7_TIGHT
;
900 mask
&= ~CODING_CATEGORY_MASK_ISO_7_TIGHT
;
901 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
, charset
))
902 mask_found
|= CODING_CATEGORY_MASK_ISO_7_ELSE
;
904 mask
&= ~CODING_CATEGORY_MASK_ISO_7_ELSE
;
905 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
, charset
))
906 mask_found
|= CODING_CATEGORY_MASK_ISO_8_ELSE
;
908 mask
&= ~CODING_CATEGORY_MASK_ISO_8_ELSE
;
912 if (inhibit_iso_escape_detection
)
917 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
918 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
920 /* Locking shift out. */
921 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
922 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
927 if (inhibit_iso_escape_detection
)
932 /* Locking shift in. */
933 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
934 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
943 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
945 if (inhibit_iso_escape_detection
)
947 if (c
!= ISO_CODE_CSI
)
949 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
950 & CODING_FLAG_ISO_SINGLE_SHIFT
)
951 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
952 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
953 & CODING_FLAG_ISO_SINGLE_SHIFT
)
954 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
957 if (VECTORP (Vlatin_extra_code_table
)
958 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
960 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
961 & CODING_FLAG_ISO_LATIN_EXTRA
)
962 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
963 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
964 & CODING_FLAG_ISO_LATIN_EXTRA
)
965 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
968 mask_found
|= newmask
;
981 if (VECTORP (Vlatin_extra_code_table
)
982 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
986 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
987 & CODING_FLAG_ISO_LATIN_EXTRA
)
988 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
989 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
990 & CODING_FLAG_ISO_LATIN_EXTRA
)
991 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
993 mask_found
|= newmask
;
1000 mask
&= ~(CODING_CATEGORY_MASK_ISO_7BIT
1001 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
1002 mask_found
|= CODING_CATEGORY_MASK_ISO_8_1
;
1003 /* Check the length of succeeding codes of the range
1004 0xA0..0xFF. If the byte length is odd, we exclude
1005 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1006 when we are not single shifting. */
1007 if (!single_shifting
1008 && mask
& CODING_CATEGORY_MASK_ISO_8_2
)
1011 while (src
< src_end
)
1019 if (i
& 1 && src
< src_end
)
1020 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
1022 mask_found
|= CODING_CATEGORY_MASK_ISO_8_2
;
1029 return (mask
& mask_found
);
1032 /* Decode a character of which charset is CHARSET, the 1st position
1033 code is C1, the 2nd position code is C2, and return the decoded
1034 character code. If the variable `translation_table' is non-nil,
1035 return the translated code. */
1037 #define DECODE_ISO_CHARACTER(charset, c1, c2) \
1038 (NILP (translation_table) \
1039 ? MAKE_CHAR (charset, c1, c2) \
1040 : translate_char (translation_table, -1, charset, c1, c2))
1042 /* Set designation state into CODING. */
1043 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1047 if (final_char < '0' || final_char >= 128) \
1048 goto label_invalid_code; \
1049 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1050 make_number (chars), \
1051 make_number (final_char)); \
1053 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1054 || coding->safe_charsets[charset])) \
1056 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1058 && charset == CHARSET_ASCII) \
1060 /* We should insert this designation sequence as is so \
1061 that it is surely written back to a file. */ \
1062 coding->spec.iso2022.last_invalid_designation_register = -1; \
1063 goto label_invalid_code; \
1065 coding->spec.iso2022.last_invalid_designation_register = -1; \
1066 if ((coding->mode & CODING_MODE_DIRECTION) \
1067 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1068 charset = CHARSET_REVERSE_CHARSET (charset); \
1069 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1073 coding->spec.iso2022.last_invalid_designation_register = reg; \
1074 goto label_invalid_code; \
1078 /* Allocate a memory block for storing information about compositions.
1079 The block is chained to the already allocated blocks. */
1082 coding_allocate_composition_data (coding
, char_offset
)
1083 struct coding_system
*coding
;
1086 struct composition_data
*cmp_data
1087 = (struct composition_data
*) xmalloc (sizeof *cmp_data
);
1089 cmp_data
->char_offset
= char_offset
;
1091 cmp_data
->prev
= coding
->cmp_data
;
1092 cmp_data
->next
= NULL
;
1093 if (coding
->cmp_data
)
1094 coding
->cmp_data
->next
= cmp_data
;
1095 coding
->cmp_data
= cmp_data
;
1096 coding
->cmp_data_start
= 0;
1099 /* Record the starting position START and METHOD of one composition. */
1101 #define CODING_ADD_COMPOSITION_START(coding, start, method) \
1103 struct composition_data *cmp_data = coding->cmp_data; \
1104 int *data = cmp_data->data + cmp_data->used; \
1105 coding->cmp_data_start = cmp_data->used; \
1107 data[1] = cmp_data->char_offset + start; \
1108 data[3] = (int) method; \
1109 cmp_data->used += 4; \
1112 /* Record the ending position END of the current composition. */
1114 #define CODING_ADD_COMPOSITION_END(coding, end) \
1116 struct composition_data *cmp_data = coding->cmp_data; \
1117 int *data = cmp_data->data + coding->cmp_data_start; \
1118 data[0] = cmp_data->used - coding->cmp_data_start; \
1119 data[2] = cmp_data->char_offset + end; \
1122 /* Record one COMPONENT (alternate character or composition rule). */
1124 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \
1125 (coding->cmp_data->data[coding->cmp_data->used++] = component)
1127 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. */
1129 #define DECODE_COMPOSITION_START(c1) \
1131 if (coding->composing == COMPOSITION_DISABLED) \
1133 *dst++ = ISO_CODE_ESC; \
1134 *dst++ = c1 & 0x7f; \
1135 coding->produced_char += 2; \
1137 else if (!COMPOSING_P (coding)) \
1139 /* This is surely the start of a composition. We must be sure \
1140 that coding->cmp_data has enough space to store the \
1141 information about the composition. If not, terminate the \
1142 current decoding loop, allocate one more memory block for \
1143 coding->cmp_data in the caller, then start the decoding \
1144 loop again. We can't allocate memory here directly because \
1145 it may cause buffer/string relocation. */ \
1146 if (!coding->cmp_data \
1147 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1148 >= COMPOSITION_DATA_SIZE)) \
1150 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1151 goto label_end_of_loop; \
1153 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1154 : c1 == '2' ? COMPOSITION_WITH_RULE \
1155 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1156 : COMPOSITION_WITH_RULE_ALTCHARS); \
1157 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1158 coding->composing); \
1159 coding->composition_rule_follows = 0; \
1163 /* We are already handling a composition. If the method is \
1164 the following two, the codes following the current escape \
1165 sequence are actual characters stored in a buffer. */ \
1166 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1167 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1169 coding->composing = COMPOSITION_RELATIVE; \
1170 coding->composition_rule_follows = 0; \
1175 /* Handle composition end sequence ESC 1. */
1177 #define DECODE_COMPOSITION_END(c1) \
1179 if (coding->composing == COMPOSITION_DISABLED) \
1181 *dst++ = ISO_CODE_ESC; \
1183 coding->produced_char += 2; \
1187 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
1188 coding->composing = COMPOSITION_NO; \
1192 /* Decode a composition rule from the byte C1 (and maybe one more byte
1193 from SRC) and store one encoded composition rule in
1194 coding->cmp_data. */
1196 #define DECODE_COMPOSITION_RULE(c1) \
1200 if (c1 < 81) /* old format (before ver.21) */ \
1202 int gref = (c1) / 9; \
1203 int nref = (c1) % 9; \
1204 if (gref == 4) gref = 10; \
1205 if (nref == 4) nref = 10; \
1206 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
1208 else if (c1 < 93) /* new format (after ver.21) */ \
1210 ONE_MORE_BYTE (c2); \
1211 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1213 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
1214 coding->composition_rule_follows = 0; \
1218 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1221 decode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1222 struct coding_system
*coding
;
1223 unsigned char *source
, *destination
;
1224 int src_bytes
, dst_bytes
;
1226 unsigned char *src
= source
;
1227 unsigned char *src_end
= source
+ src_bytes
;
1228 unsigned char *dst
= destination
;
1229 unsigned char *dst_end
= destination
+ dst_bytes
;
1230 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1231 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1232 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1233 /* SRC_BASE remembers the start position in source in each loop.
1234 The loop will be exited when there's not enough source code
1235 (within macro ONE_MORE_BYTE), or when there's not enough
1236 destination area to produce a character (within macro
1238 unsigned char *src_base
;
1240 Lisp_Object translation_table
;
1242 if (NILP (Venable_character_translation
))
1243 translation_table
= Qnil
;
1246 translation_table
= coding
->translation_table_for_decode
;
1247 if (NILP (translation_table
))
1248 translation_table
= Vstandard_translation_table_for_decode
;
1251 coding
->result
= CODING_FINISH_NORMAL
;
1260 /* We produce no character or one character. */
1261 switch (iso_code_class
[c1
])
1263 case ISO_0x20_or_0x7F
:
1264 if (COMPOSING_P (coding
) && coding
->composition_rule_follows
)
1266 DECODE_COMPOSITION_RULE (c1
);
1269 if (charset0
< 0 || CHARSET_CHARS (charset0
) == 94)
1271 /* This is SPACE or DEL. */
1272 charset
= CHARSET_ASCII
;
1275 /* This is a graphic character, we fall down ... */
1277 case ISO_graphic_plane_0
:
1278 if (COMPOSING_P (coding
) && coding
->composition_rule_follows
)
1280 DECODE_COMPOSITION_RULE (c1
);
1286 case ISO_0xA0_or_0xFF
:
1287 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94
1288 || coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1289 goto label_invalid_code
;
1290 /* This is a graphic character, we fall down ... */
1292 case ISO_graphic_plane_1
:
1294 goto label_invalid_code
;
1299 if (COMPOSING_P (coding
))
1300 DECODE_COMPOSITION_END ('1');
1302 /* All ISO2022 control characters in this class have the
1303 same representation in Emacs internal format. */
1305 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1306 && (coding
->eol_type
== CODING_EOL_CR
1307 || coding
->eol_type
== CODING_EOL_CRLF
))
1309 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
1310 goto label_end_of_loop
;
1312 charset
= CHARSET_ASCII
;
1316 if (COMPOSING_P (coding
))
1317 DECODE_COMPOSITION_END ('1');
1318 goto label_invalid_code
;
1320 case ISO_carriage_return
:
1321 if (COMPOSING_P (coding
))
1322 DECODE_COMPOSITION_END ('1');
1324 if (coding
->eol_type
== CODING_EOL_CR
)
1326 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1329 if (c1
!= ISO_CODE_LF
)
1331 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1333 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
1334 goto label_end_of_loop
;
1340 charset
= CHARSET_ASCII
;
1344 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1345 || CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
1346 goto label_invalid_code
;
1347 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
1348 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1352 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
1353 goto label_invalid_code
;
1354 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
1355 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1358 case ISO_single_shift_2_7
:
1359 case ISO_single_shift_2
:
1360 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1361 goto label_invalid_code
;
1362 /* SS2 is handled as an escape sequence of ESC 'N' */
1364 goto label_escape_sequence
;
1366 case ISO_single_shift_3
:
1367 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1368 goto label_invalid_code
;
1369 /* SS3 is handled as an escape sequence of ESC 'O' */
1371 goto label_escape_sequence
;
1373 case ISO_control_sequence_introducer
:
1374 /* CSI is handled as an escape sequence of ESC '[' ... */
1376 goto label_escape_sequence
;
1380 label_escape_sequence
:
1381 /* Escape sequences handled by Emacs are invocation,
1382 designation, direction specification, and character
1383 composition specification. */
1386 case '&': /* revision of following character set */
1388 if (!(c1
>= '@' && c1
<= '~'))
1389 goto label_invalid_code
;
1391 if (c1
!= ISO_CODE_ESC
)
1392 goto label_invalid_code
;
1394 goto label_escape_sequence
;
1396 case '$': /* designation of 2-byte character set */
1397 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1398 goto label_invalid_code
;
1400 if (c1
>= '@' && c1
<= 'B')
1401 { /* designation of JISX0208.1978, GB2312.1980,
1403 DECODE_DESIGNATION (0, 2, 94, c1
);
1405 else if (c1
>= 0x28 && c1
<= 0x2B)
1406 { /* designation of DIMENSION2_CHARS94 character set */
1408 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
1410 else if (c1
>= 0x2C && c1
<= 0x2F)
1411 { /* designation of DIMENSION2_CHARS96 character set */
1413 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
1416 goto label_invalid_code
;
1417 /* We must update these variables now. */
1418 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1419 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1422 case 'n': /* invocation of locking-shift-2 */
1423 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1424 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1425 goto label_invalid_code
;
1426 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
1427 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1430 case 'o': /* invocation of locking-shift-3 */
1431 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1432 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1433 goto label_invalid_code
;
1434 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
1435 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1438 case 'N': /* invocation of single-shift-2 */
1439 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1440 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1441 goto label_invalid_code
;
1442 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
1446 case 'O': /* invocation of single-shift-3 */
1447 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1448 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1449 goto label_invalid_code
;
1450 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1454 case '0': case '2': case '3': case '4': /* start composition */
1455 DECODE_COMPOSITION_START (c1
);
1458 case '1': /* end composition */
1459 DECODE_COMPOSITION_END (c1
);
1462 case '[': /* specification of direction */
1463 if (coding
->flags
& CODING_FLAG_ISO_NO_DIRECTION
)
1464 goto label_invalid_code
;
1465 /* For the moment, nested direction is not supported.
1466 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1467 left-to-right, and nonzero means right-to-left. */
1471 case ']': /* end of the current direction */
1472 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1474 case '0': /* end of the current direction */
1475 case '1': /* start of left-to-right direction */
1478 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1480 goto label_invalid_code
;
1483 case '2': /* start of right-to-left direction */
1486 coding
->mode
|= CODING_MODE_DIRECTION
;
1488 goto label_invalid_code
;
1492 goto label_invalid_code
;
1497 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1498 goto label_invalid_code
;
1499 if (c1
>= 0x28 && c1
<= 0x2B)
1500 { /* designation of DIMENSION1_CHARS94 character set */
1502 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1504 else if (c1
>= 0x2C && c1
<= 0x2F)
1505 { /* designation of DIMENSION1_CHARS96 character set */
1507 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1510 goto label_invalid_code
;
1511 /* We must update these variables now. */
1512 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1513 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1518 /* Now we know CHARSET and 1st position code C1 of a character.
1519 Produce a multibyte sequence for that character while getting
1520 2nd position code C2 if necessary. */
1521 if (CHARSET_DIMENSION (charset
) == 2)
1524 if (c1
< 0x80 ? c2
< 0x20 || c2
>= 0x80 : c2
< 0xA0)
1525 /* C2 is not in a valid range. */
1526 goto label_invalid_code
;
1528 c
= DECODE_ISO_CHARACTER (charset
, c1
, c2
);
1534 if (COMPOSING_P (coding
))
1535 DECODE_COMPOSITION_END ('1');
1542 coding
->consumed
= coding
->consumed_char
= src_base
- source
;
1543 coding
->produced
= dst
- destination
;
1548 /* ISO2022 encoding stuff. */
1551 It is not enough to say just "ISO2022" on encoding, we have to
1552 specify more details. In Emacs, each coding system of ISO2022
1553 variant has the following specifications:
1554 1. Initial designation to G0 thru G3.
1555 2. Allows short-form designation?
1556 3. ASCII should be designated to G0 before control characters?
1557 4. ASCII should be designated to G0 at end of line?
1558 5. 7-bit environment or 8-bit environment?
1559 6. Use locking-shift?
1560 7. Use Single-shift?
1561 And the following two are only for Japanese:
1562 8. Use ASCII in place of JIS0201-1976-Roman?
1563 9. Use JISX0208-1983 in place of JISX0208-1978?
1564 These specifications are encoded in `coding->flags' as flag bits
1565 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1569 /* Produce codes (escape sequence) for designating CHARSET to graphic
1570 register REG at DST, and increment DST. If <final-char> of CHARSET is
1571 '@', 'A', or 'B' and the coding system CODING allows, produce
1572 designation sequence of short-form. */
1574 #define ENCODE_DESIGNATION(charset, reg, coding) \
1576 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1577 char *intermediate_char_94 = "()*+"; \
1578 char *intermediate_char_96 = ",-./"; \
1579 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
1581 if (revision < 255) \
1583 *dst++ = ISO_CODE_ESC; \
1585 *dst++ = '@' + revision; \
1587 *dst++ = ISO_CODE_ESC; \
1588 if (CHARSET_DIMENSION (charset) == 1) \
1590 if (CHARSET_CHARS (charset) == 94) \
1591 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1593 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1598 if (CHARSET_CHARS (charset) == 94) \
1600 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1602 || final_char < '@' || final_char > 'B') \
1603 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1606 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1608 *dst++ = final_char; \
1609 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1612 /* The following two macros produce codes (control character or escape
1613 sequence) for ISO2022 single-shift functions (single-shift-2 and
1616 #define ENCODE_SINGLE_SHIFT_2 \
1618 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1619 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1621 *dst++ = ISO_CODE_SS2; \
1622 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1625 #define ENCODE_SINGLE_SHIFT_3 \
1627 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1628 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1630 *dst++ = ISO_CODE_SS3; \
1631 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1634 /* The following four macros produce codes (control character or
1635 escape sequence) for ISO2022 locking-shift functions (shift-in,
1636 shift-out, locking-shift-2, and locking-shift-3). */
1638 #define ENCODE_SHIFT_IN \
1640 *dst++ = ISO_CODE_SI; \
1641 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1644 #define ENCODE_SHIFT_OUT \
1646 *dst++ = ISO_CODE_SO; \
1647 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1650 #define ENCODE_LOCKING_SHIFT_2 \
1652 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1653 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1656 #define ENCODE_LOCKING_SHIFT_3 \
1658 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1659 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1662 /* Produce codes for a DIMENSION1 character whose character set is
1663 CHARSET and whose position-code is C1. Designation and invocation
1664 sequences are also produced in advance if necessary. */
1666 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1668 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1670 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1671 *dst++ = c1 & 0x7F; \
1673 *dst++ = c1 | 0x80; \
1674 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1677 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1679 *dst++ = c1 & 0x7F; \
1682 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1684 *dst++ = c1 | 0x80; \
1687 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1688 && !coding->safe_charsets[charset]) \
1690 /* We should not encode this character, instead produce one or \
1692 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1693 if (CHARSET_WIDTH (charset) == 2) \
1694 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1698 /* Since CHARSET is not yet invoked to any graphic planes, we \
1699 must invoke it, or, at first, designate it to some graphic \
1700 register. Then repeat the loop to actually produce the \
1702 dst = encode_invocation_designation (charset, coding, dst); \
1705 /* Produce codes for a DIMENSION2 character whose character set is
1706 CHARSET and whose position-codes are C1 and C2. Designation and
1707 invocation codes are also produced in advance if necessary. */
1709 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1711 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1713 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1714 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1716 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1717 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1720 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1722 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1725 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1727 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1730 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1731 && !coding->safe_charsets[charset]) \
1733 /* We should not encode this character, instead produce one or \
1735 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1736 if (CHARSET_WIDTH (charset) == 2) \
1737 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1741 /* Since CHARSET is not yet invoked to any graphic planes, we \
1742 must invoke it, or, at first, designate it to some graphic \
1743 register. Then repeat the loop to actually produce the \
1745 dst = encode_invocation_designation (charset, coding, dst); \
1748 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1750 int alt_charset = charset; \
1752 if (CHARSET_DEFINED_P (charset)) \
1754 if (CHARSET_DIMENSION (charset) == 1) \
1756 if (charset == CHARSET_ASCII \
1757 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
1758 alt_charset = charset_latin_jisx0201; \
1759 ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1); \
1763 if (charset == charset_jisx0208 \
1764 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
1765 alt_charset = charset_jisx0208_1978; \
1766 ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2); \
1777 /* Produce designation and invocation codes at a place pointed by DST
1778 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1782 encode_invocation_designation (charset
, coding
, dst
)
1784 struct coding_system
*coding
;
1787 int reg
; /* graphic register number */
1789 /* At first, check designations. */
1790 for (reg
= 0; reg
< 4; reg
++)
1791 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1796 /* CHARSET is not yet designated to any graphic registers. */
1797 /* At first check the requested designation. */
1798 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1799 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1800 /* Since CHARSET requests no special designation, designate it
1801 to graphic register 0. */
1804 ENCODE_DESIGNATION (charset
, reg
, coding
);
1807 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1808 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1810 /* Since the graphic register REG is not invoked to any graphic
1811 planes, invoke it to graphic plane 0. */
1814 case 0: /* graphic register 0 */
1818 case 1: /* graphic register 1 */
1822 case 2: /* graphic register 2 */
1823 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1824 ENCODE_SINGLE_SHIFT_2
;
1826 ENCODE_LOCKING_SHIFT_2
;
1829 case 3: /* graphic register 3 */
1830 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1831 ENCODE_SINGLE_SHIFT_3
;
1833 ENCODE_LOCKING_SHIFT_3
;
1841 /* Produce 2-byte codes for encoded composition rule RULE. */
1843 #define ENCODE_COMPOSITION_RULE(rule) \
1846 COMPOSITION_DECODE_RULE (rule, gref, nref); \
1847 *dst++ = 32 + 81 + gref; \
1848 *dst++ = 32 + nref; \
1851 /* Produce codes for indicating the start of a composition sequence
1852 (ESC 0, ESC 3, or ESC 4). DATA points to an array of integers
1853 which specify information about the composition. See the comment
1854 in coding.h for the format of DATA. */
1856 #define ENCODE_COMPOSITION_START(coding, data) \
1858 coding->composing = data[3]; \
1859 *dst++ = ISO_CODE_ESC; \
1860 if (coding->composing == COMPOSITION_RELATIVE) \
1864 *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS \
1866 coding->cmp_data_index = coding->cmp_data_start + 4; \
1867 coding->composition_rule_follows = 0; \
1871 /* Produce codes for indicating the end of the current composition. */
1873 #define ENCODE_COMPOSITION_END(coding, data) \
1875 *dst++ = ISO_CODE_ESC; \
1877 coding->cmp_data_start += data[0]; \
1878 coding->composing = COMPOSITION_NO; \
1879 if (coding->cmp_data_start == coding->cmp_data->used \
1880 && coding->cmp_data->next) \
1882 coding->cmp_data = coding->cmp_data->next; \
1883 coding->cmp_data_start = 0; \
1887 /* Produce composition start sequence ESC 0. Here, this sequence
1888 doesn't mean the start of a new composition but means that we have
1889 just produced components (alternate chars and composition rules) of
1890 the composition and the actual text follows in SRC. */
1892 #define ENCODE_COMPOSITION_FAKE_START(coding) \
1894 *dst++ = ISO_CODE_ESC; \
1896 coding->composing = COMPOSITION_RELATIVE; \
/* The following three macros produce codes for indicating direction
   of text.  Each one writes at DST and advances DST; `coding' and
   `dst' must be in scope at the point of use.  */

/* Produce the control sequence introducer: the two-byte form ESC '['
   when CODING_FLAG_ISO_SEVEN_BITS is set in coding->flags, otherwise
   the single byte ISO_CODE_CSI.  The flag is tested with `&', like
   every other 7-bit check in this file, so the 7-bit form is chosen
   even when other flag bits are set as well.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = '[';					\
      }							\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

/* Produce the introducer followed by '2' ']'.  This is a statement
   macro: the introducer above is a `do ... while (0)' statement and
   cannot be used inside a comma expression.  */
#define ENCODE_DIRECTION_R2L				\
  do {							\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;			\
    *dst++ = '2';					\
    *dst++ = ']';					\
  } while (0)

/* Produce the introducer followed by '0' ']'.  */
#define ENCODE_DIRECTION_L2R				\
  do {							\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;			\
    *dst++ = '0';					\
    *dst++ = ']';					\
  } while (0)
1915 /* Produce codes for designation and invocation to reset the graphic
1916 planes and registers to initial state. */
1917 #define ENCODE_RESET_PLANE_AND_REGISTER \
1920 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1922 for (reg = 0; reg < 4; reg++) \
1923 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1924 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1925 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1926 ENCODE_DESIGNATION \
1927 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1930 /* Produce designation sequences of charsets in the line started from
1931 SRC to a place pointed by DST, and return updated DST.
1933 If the current block ends before any end-of-line, we may fail to
1934 find all the necessary designations. */
1936 static unsigned char *
1937 encode_designation_at_bol (coding
, translation_table
, src
, src_end
, dst
)
1938 struct coding_system
*coding
;
1939 Lisp_Object translation_table
;
1940 unsigned char *src
, *src_end
, *dst
;
1942 int charset
, c
, found
= 0, reg
;
1943 /* Table of charsets to be designated to each graphic register. */
1946 for (reg
= 0; reg
< 4; reg
++)
1955 charset
= CHAR_CHARSET (c
);
1956 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1957 if (reg
!= CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
&& r
[reg
] < 0)
1967 for (reg
= 0; reg
< 4; reg
++)
1969 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1970 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1976 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1979 encode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1980 struct coding_system
*coding
;
1981 unsigned char *source
, *destination
;
1982 int src_bytes
, dst_bytes
;
1984 unsigned char *src
= source
;
1985 unsigned char *src_end
= source
+ src_bytes
;
1986 unsigned char *dst
= destination
;
1987 unsigned char *dst_end
= destination
+ dst_bytes
;
1988 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1989 from DST_END to assure overflow checking is necessary only at the
1991 unsigned char *adjusted_dst_end
= dst_end
- 19;
1992 /* SRC_BASE remembers the start position in source in each loop.
1993 The loop will be exited when there's not enough source text to
1994 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1995 there's not enough destination area to produce encoded codes
1996 (within macro EMIT_BYTES). */
1997 unsigned char *src_base
;
1999 Lisp_Object translation_table
;
2001 if (NILP (Venable_character_translation
))
2002 translation_table
= Qnil
;
2005 translation_table
= coding
->translation_table_for_encode
;
2006 if (NILP (translation_table
))
2007 translation_table
= Vstandard_translation_table_for_encode
;
2010 coding
->consumed_char
= 0;
2014 int charset
, c1
, c2
;
2018 if (dst
>= (dst_bytes
? adjusted_dst_end
: (src
- 19)))
2020 coding
->result
= CODING_FINISH_INSUFFICIENT_DST
;
2024 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
2025 && CODING_SPEC_ISO_BOL (coding
))
2027 /* We have to produce designation sequences if any now. */
2028 dst
= encode_designation_at_bol (coding
, translation_table
,
2030 CODING_SPEC_ISO_BOL (coding
) = 0;
2033 /* Check composition start and end. */
2034 if (coding
->composing
!= COMPOSITION_DISABLED
2035 && coding
->cmp_data_start
< coding
->cmp_data
->used
)
2037 struct composition_data
*cmp_data
= coding
->cmp_data
;
2038 int *data
= cmp_data
->data
+ coding
->cmp_data_start
;
2039 int this_pos
= cmp_data
->char_offset
+ coding
->consumed_char
;
2041 if (coding
->composing
== COMPOSITION_RELATIVE
)
2043 if (this_pos
== data
[2])
2045 ENCODE_COMPOSITION_END (coding
, data
);
2046 cmp_data
= coding
->cmp_data
;
2047 data
= cmp_data
->data
+ coding
->cmp_data_start
;
2050 else if (COMPOSING_P (coding
))
2052 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS */
2053 if (coding
->cmp_data_index
== coding
->cmp_data_start
+ data
[0])
2054 /* We have consumed components of the composition.
2055 What follows in SRC is the composition's base
2057 ENCODE_COMPOSITION_FAKE_START (coding
);
2060 int c
= cmp_data
->data
[coding
->cmp_data_index
++];
2061 if (coding
->composition_rule_follows
)
2063 ENCODE_COMPOSITION_RULE (c
);
2064 coding
->composition_rule_follows
= 0;
2068 SPLIT_CHAR (c
, charset
, c1
, c2
);
2069 ENCODE_ISO_CHARACTER (charset
, c1
, c2
);
2070 if (coding
->composing
== COMPOSITION_WITH_RULE_ALTCHARS
)
2071 coding
->composition_rule_follows
= 1;
2076 if (!COMPOSING_P (coding
))
2078 if (this_pos
== data
[1])
2080 ENCODE_COMPOSITION_START (coding
, data
);
2088 /* Now encode the character C. */
2089 if (c
< 0x20 || c
== 0x7F)
2093 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
2095 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
2096 ENCODE_RESET_PLANE_AND_REGISTER
;
2100 /* fall down to treat '\r' as '\n' ... */
2105 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
2106 ENCODE_RESET_PLANE_AND_REGISTER
;
2107 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
2108 bcopy (coding
->spec
.iso2022
.initial_designation
,
2109 coding
->spec
.iso2022
.current_designation
,
2110 sizeof coding
->spec
.iso2022
.initial_designation
);
2111 if (coding
->eol_type
== CODING_EOL_LF
2112 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2113 *dst
++ = ISO_CODE_LF
;
2114 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2115 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
2117 *dst
++ = ISO_CODE_CR
;
2118 CODING_SPEC_ISO_BOL (coding
) = 1;
2122 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
2123 ENCODE_RESET_PLANE_AND_REGISTER
;
2127 else if (ASCII_BYTE_P (c
))
2128 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c
, /* dummy */ c1
);
2129 else if (SINGLE_BYTE_CHAR_P (c
))
2136 SPLIT_CHAR (c
, charset
, c1
, c2
);
2137 ENCODE_ISO_CHARACTER (charset
, c1
, c2
);
2140 coding
->consumed_char
++;
2144 coding
->consumed
= src_base
- source
;
2145 coding
->produced
= coding
->produced_char
= dst
- destination
;
2149 /*** 4. SJIS and BIG5 handlers ***/
2151 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2152 quite widely. So, for the moment, Emacs supports them in the bare
2153 C code. But, in the future, they may be supported only by CCL. */
2155 /* SJIS is a coding system encoding three character sets: ASCII, right
2156 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2157 as is. A character of charset katakana-jisx0201 is encoded by
2158 "position-code + 0x80". A character of charset japanese-jisx0208
2159 is encoded in 2 bytes, but the two position-codes are divided and shifted
2160 so that they fit in the ranges below.
2162 --- CODE RANGE of SJIS ---
2163 (character set) (range)
2165 KATAKANA-JISX0201 0xA0 .. 0xDF
2166 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2167 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2168 -------------------------------
2172 /* BIG5 is a coding system encoding two character sets: ASCII and
2173 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2174 character set and is encoded in two-byte.
2176 --- CODE RANGE of BIG5 ---
2177 (character set) (range)
2179 Big5 (1st byte) 0xA1 .. 0xFE
2180 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2181 --------------------------
2183 Since the number of characters in Big5 is larger than maximum
2184 characters in Emacs' charset (96x96), it can't be handled as one
2185 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2186 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2187 contains frequently used characters and the latter contains less
2188 frequently used characters. */
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
/* Number of Big5 characters sharing one 1st-byte value: the count of
   2nd-byte codes in 0xA1..0xFE plus the count in 0x40..0x7E.  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))
2198 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
2201 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
2203 charset = charset_big5_1; \
2206 charset = charset_big5_2; \
2207 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
2209 c1 = temp / (0xFF - 0xA1) + 0x21; \
2210 c2 = temp % (0xFF - 0xA1) + 0x21; \
2213 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
2215 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
2216 if (charset == charset_big5_2) \
2217 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2218 b1 = temp / BIG5_SAME_ROW + 0xA1; \
2219 b2 = temp % BIG5_SAME_ROW; \
2220 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2223 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2224 Check if a text is encoded in SJIS. If it is, return
2225 CODING_CATEGORY_MASK_SJIS, else return 0. */
2228 detect_coding_sjis (src
, src_end
)
2229 unsigned char *src
, *src_end
;
2232 /* Dummy for ONE_MORE_BYTE. */
2233 struct coding_system dummy_coding
;
2234 struct coding_system
*coding
= &dummy_coding
;
2239 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
2247 return CODING_CATEGORY_MASK_SJIS
;
2250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2251 Check if a text is encoded in BIG5. If it is, return
2252 CODING_CATEGORY_MASK_BIG5, else return 0. */
2255 detect_coding_big5 (src
, src_end
)
2256 unsigned char *src
, *src_end
;
2259 /* Dummy for ONE_MORE_BYTE. */
2260 struct coding_system dummy_coding
;
2261 struct coding_system
*coding
= &dummy_coding
;
2269 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
2274 return CODING_CATEGORY_MASK_BIG5
;
2277 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2278 Check if a text is encoded in UTF-8. If it is, return
2279 CODING_CATEGORY_MASK_UTF_8, else return 0. */
/* Octet classifiers used while scanning for UTF-8.  Each predicate
   masks off the high bits of C and compares them with the bit pattern
   that marks the octet kind: 0xxxxxxx is a one-octet character,
   10xxxxxx is an extra (trailing) octet, and a leading octet of an
   N-octet sequence starts with N one-bits followed by a zero bit.  */
#define UTF_8_OCTET_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFE, 0xFC)
2290 detect_coding_utf_8 (src
, src_end
)
2291 unsigned char *src
, *src_end
;
2294 int seq_maybe_bytes
;
2295 /* Dummy for ONE_MORE_BYTE. */
2296 struct coding_system dummy_coding
;
2297 struct coding_system
*coding
= &dummy_coding
;
2302 if (UTF_8_1_OCTET_P (c
))
2304 else if (UTF_8_2_OCTET_LEADING_P (c
))
2305 seq_maybe_bytes
= 1;
2306 else if (UTF_8_3_OCTET_LEADING_P (c
))
2307 seq_maybe_bytes
= 2;
2308 else if (UTF_8_4_OCTET_LEADING_P (c
))
2309 seq_maybe_bytes
= 3;
2310 else if (UTF_8_5_OCTET_LEADING_P (c
))
2311 seq_maybe_bytes
= 4;
2312 else if (UTF_8_6_OCTET_LEADING_P (c
))
2313 seq_maybe_bytes
= 5;
2320 if (!UTF_8_EXTRA_OCTET_P (c
))
2324 while (seq_maybe_bytes
> 0);
2328 return CODING_CATEGORY_MASK_UTF_8
;
2331 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2332 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2333 Little Endian (otherwise). If it is, return
2334 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val) \
  (((val) == 0xFFFE) \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, i.e. lies in
   0xD800..0xDBFF.  The top six bits select the surrogate half, so the
   mask must be 0xFC00; masking with 0xD800 would also accept low
   surrogates and values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, i.e. lies in
   0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2348 detect_coding_utf_16 (src
, src_end
)
2349 unsigned char *src
, *src_end
;
2351 unsigned char c1
, c2
;
2352 /* Dummy for TWO_MORE_BYTES. */
2353 struct coding_system dummy_coding
;
2354 struct coding_system
*coding
= &dummy_coding
;
2356 TWO_MORE_BYTES (c1
, c2
);
2358 if ((c1
== 0xFF) && (c2
== 0xFE))
2359 return CODING_CATEGORY_MASK_UTF_16_LE
;
2360 else if ((c1
== 0xFE) && (c2
== 0xFF))
2361 return CODING_CATEGORY_MASK_UTF_16_BE
;
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   If SJIS_P is 1, decode SJIS text, else decode BIG5 text.  */
2371 decode_coding_sjis_big5 (coding
, source
, destination
,
2372 src_bytes
, dst_bytes
, sjis_p
)
2373 struct coding_system
*coding
;
2374 unsigned char *source
, *destination
;
2375 int src_bytes
, dst_bytes
;
2378 unsigned char *src
= source
;
2379 unsigned char *src_end
= source
+ src_bytes
;
2380 unsigned char *dst
= destination
;
2381 unsigned char *dst_end
= destination
+ dst_bytes
;
2382 /* SRC_BASE remembers the start position in source in each loop.
2383 The loop will be exited when there's not enough source code
2384 (within macro ONE_MORE_BYTE), or when there's not enough
2385 destination area to produce a character (within macro
2387 unsigned char *src_base
;
2388 Lisp_Object translation_table
;
2390 if (NILP (Venable_character_translation
))
2391 translation_table
= Qnil
;
2394 translation_table
= coding
->translation_table_for_decode
;
2395 if (NILP (translation_table
))
2396 translation_table
= Vstandard_translation_table_for_decode
;
2399 coding
->produced_char
= 0;
2402 int c
, charset
, c1
, c2
;
2409 charset
= CHARSET_ASCII
;
2414 if (coding
->eol_type
== CODING_EOL_CRLF
)
2419 else if (coding
->mode
2420 & CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2422 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2423 goto label_end_of_loop
;
2426 /* To process C2 again, SRC is subtracted by 1. */
2429 else if (coding
->eol_type
== CODING_EOL_CR
)
2433 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2434 && (coding
->eol_type
== CODING_EOL_CR
2435 || coding
->eol_type
== CODING_EOL_CRLF
))
2437 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2438 goto label_end_of_loop
;
2447 goto label_invalid_code
;
2448 if (c1
< 0xA0 || c1
>= 0xE0)
2450 /* SJIS -> JISX0208 */
2452 if (c2
< 0x40 || c2
== 0x7F || c2
> 0xFC)
2453 goto label_invalid_code
;
2454 DECODE_SJIS (c1
, c2
, c1
, c2
);
2455 charset
= charset_jisx0208
;
2458 /* SJIS -> JISX0201-Kana */
2459 charset
= charset_katakana_jisx0201
;
2464 if (c1
< 0xA1 || c1
> 0xFE)
2465 goto label_invalid_code
;
2467 if (c2
< 0x40 || (c2
> 0x7E && c2
< 0xA1) || c2
> 0xFE)
2468 goto label_invalid_code
;
2469 DECODE_BIG5 (c1
, c2
, charset
, c1
, c2
);
2473 c
= DECODE_ISO_CHARACTER (charset
, c1
, c2
);
2485 coding
->consumed
= coding
->consumed_char
= src_base
- source
;
2486 coding
->produced
= dst
- destination
;
2490 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2491 This function can encode charsets `ascii', `katakana-jisx0201',
2492 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2493 are sure that all these charsets are registered as official charset
2494 (i.e. do not have extended leading-codes). Characters of other
2495 charsets are produced without any encoding. If SJIS_P is 1, encode
2496 SJIS text, else encode BIG5 text. */
2499 encode_coding_sjis_big5 (coding
, source
, destination
,
2500 src_bytes
, dst_bytes
, sjis_p
)
2501 struct coding_system
*coding
;
2502 unsigned char *source
, *destination
;
2503 int src_bytes
, dst_bytes
;
2506 unsigned char *src
= source
;
2507 unsigned char *src_end
= source
+ src_bytes
;
2508 unsigned char *dst
= destination
;
2509 unsigned char *dst_end
= destination
+ dst_bytes
;
2510 /* SRC_BASE remembers the start position in source in each loop.
2511 The loop will be exited when there's not enough source text to
2512 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2513 there's not enough destination area to produce encoded codes
2514 (within macro EMIT_BYTES). */
2515 unsigned char *src_base
;
2516 Lisp_Object translation_table
;
2518 if (NILP (Venable_character_translation
))
2519 translation_table
= Qnil
;
2522 translation_table
= coding
->translation_table_for_decode
;
2523 if (NILP (translation_table
))
2524 translation_table
= Vstandard_translation_table_for_decode
;
2529 int c
, charset
, c1
, c2
;
2534 /* Now encode the character C. */
2535 if (SINGLE_BYTE_CHAR_P (c
))
2540 if (!coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
2547 if (coding
->eol_type
== CODING_EOL_CRLF
)
2549 EMIT_TWO_BYTES ('\r', c
);
2552 else if (coding
->eol_type
== CODING_EOL_CR
)
2560 SPLIT_CHAR (c
, charset
, c1
, c2
);
2563 if (charset
== charset_jisx0208
2564 || charset
== charset_jisx0208_1978
)
2566 ENCODE_SJIS (c1
, c2
, c1
, c2
);
2567 EMIT_TWO_BYTES (c1
, c2
);
2569 else if (charset
== charset_latin_jisx0201
)
2572 /* There's no way other than producing the internal
2574 EMIT_BYTES (src_base
, src
);
2578 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
2580 ENCODE_BIG5 (charset
, c1
, c2
, c1
, c2
);
2581 EMIT_TWO_BYTES (c1
, c2
);
2584 /* There's no way other than producing the internal
2586 EMIT_BYTES (src_base
, src
);
2589 coding
->consumed_char
++;
2593 coding
->consumed
= src_base
- source
;
2594 coding
->produced
= coding
->produced_char
= dst
- destination
;
2598 /*** 5. CCL handlers ***/
2600 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2601 Check if a text is encoded in a coding system of which
2602 encoder/decoder are written in CCL program. If it is, return
2603 CODING_CATEGORY_MASK_CCL, else return 0. */
2606 detect_coding_ccl (src
, src_end
)
2607 unsigned char *src
, *src_end
;
2609 unsigned char *valid
;
2611 /* Dummy for ONE_MORE_BYTE. */
2612 struct coding_system dummy_coding
;
2613 struct coding_system
*coding
= &dummy_coding
;
2615 /* No coding system is assigned to coding-category-ccl. */
2616 if (!coding_system_table
[CODING_CATEGORY_IDX_CCL
])
2619 valid
= coding_system_table
[CODING_CATEGORY_IDX_CCL
]->spec
.ccl
.valid_codes
;
2627 return CODING_CATEGORY_MASK_CCL
;
2631 /*** 6. End-of-line handlers ***/
2633 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2636 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2637 struct coding_system
*coding
;
2638 unsigned char *source
, *destination
;
2639 int src_bytes
, dst_bytes
;
2641 unsigned char *src
= source
;
2642 unsigned char *dst
= destination
;
2643 unsigned char *src_end
= src
+ src_bytes
;
2644 unsigned char *dst_end
= dst
+ dst_bytes
;
2645 Lisp_Object translation_table
;
2646 /* SRC_BASE remembers the start position in source in each loop.
2647 The loop will be exited when there's not enough source code
2648 (within macro ONE_MORE_BYTE), or when there's not enough
2649 destination area to produce a character (within macro
2651 unsigned char *src_base
;
2654 translation_table
= Qnil
;
2655 switch (coding
->eol_type
)
2657 case CODING_EOL_CRLF
:
2667 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2669 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2670 goto label_end_of_loop
;
2677 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
))
2679 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2680 goto label_end_of_loop
;
2693 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2695 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2696 goto label_end_of_loop
;
2705 default: /* no need for EOL handling */
2715 coding
->consumed
= coding
->consumed_char
= src_base
- source
;
2716 coding
->produced
= dst
- destination
;
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
   format of end-of-line according to `coding->eol_type'.  It also
   converts multibyte form 8-bit characters to unibyte if
   CODING->src_multibyte is nonzero.  If `coding->mode &
   CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
   also means end-of-line.  */
2728 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2729 struct coding_system
*coding
;
2730 unsigned char *source
, *destination
;
2731 int src_bytes
, dst_bytes
;
2733 unsigned char *src
= source
;
2734 unsigned char *dst
= destination
;
2735 unsigned char *src_end
= src
+ src_bytes
;
2736 unsigned char *dst_end
= dst
+ dst_bytes
;
2737 Lisp_Object translation_table
;
2738 /* SRC_BASE remembers the start position in source in each loop.
2739 The loop will be exited when there's not enough source text to
2740 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2741 there's not enough destination area to produce encoded codes
2742 (within macro EMIT_BYTES). */
2743 unsigned char *src_base
;
2745 int selective_display
= coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
;
2747 translation_table
= Qnil
;
2748 if (coding
->src_multibyte
2749 && *(src_end
- 1) == LEADING_CODE_8_BIT_CONTROL
)
2753 coding
->result
= CODING_FINISH_INSUFFICIENT_SRC
;
2756 if (coding
->eol_type
== CODING_EOL_CRLF
)
2758 while (src
< src_end
)
2764 else if (c
== '\n' || (c
== '\r' && selective_display
))
2765 EMIT_TWO_BYTES ('\r', '\n');
2775 if (src_bytes
<= dst_bytes
)
2777 safe_bcopy (src
, dst
, src_bytes
);
2783 if (coding
->src_multibyte
2784 && *(src
+ dst_bytes
- 1) == LEADING_CODE_8_BIT_CONTROL
)
2786 safe_bcopy (src
, dst
, dst_bytes
);
2787 src_base
= src
+ dst_bytes
;
2788 dst
= destination
+ dst_bytes
;
2789 coding
->result
= CODING_FINISH_INSUFFICIENT_DST
;
2791 if (coding
->eol_type
== CODING_EOL_CR
)
2793 for (src
= destination
; src
< dst
; src
++)
2794 if (*src
== '\n') *src
= '\r';
2796 else if (selective_display
)
2798 for (src
= destination
; src
< dst
; src
++)
2799 if (*src
== '\r') *src
= '\n';
2802 if (coding
->src_multibyte
)
2803 dst
= destination
+ str_as_unibyte (destination
, dst
- destination
);
2805 coding
->consumed
= src_base
- source
;
2806 coding
->produced
= dst
- destination
;
2810 /*** 7. C library functions ***/
2812 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2813 has a property `coding-system'. The value of this property is a
2814 vector of length 5 (called as coding-vector). Among elements of
2815 this vector, the first (element[0]) and the fifth (element[4])
2816 carry important information for decoding/encoding. Before
2817 decoding/encoding, this information should be set in fields of a
2818 structure of type `coding_system'.
2820 A value of property `coding-system' can be a symbol of another
2821 subsidiary coding-system. In that case, Emacs gets coding-vector
2824 `element[0]' contains information to be set in `coding->type'. The
2825 value and its meaning is as follows:
2827 0 -- coding_type_emacs_mule
2828 1 -- coding_type_sjis
2829 2 -- coding_type_iso2022
2830 3 -- coding_type_big5
2831 4 -- coding_type_ccl encoder/decoder written in CCL
2832 nil -- coding_type_no_conversion
2833 t -- coding_type_undecided (automatic conversion on decoding,
2834 no-conversion on encoding)
2836 `element[4]' contains information to be set in `coding->flags' and
2837 `coding->spec'. The meaning varies by `coding->type'.
2839 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2840 of length 32 (of which the first 13 sub-elements are used now).
2841 Meanings of these sub-elements are:
2843 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2844 If the value is an integer of valid charset, the charset is
2845 assumed to be designated to graphic register N initially.
2847 If the value is minus, it is a minus value of charset which
2848 reserves graphic register N, which means that the charset is
2849 not designated initially but should be designated to graphic
2850 register N just before encoding a character in that charset.
2852 If the value is nil, graphic register N is never used on
2855 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2856 Each value takes t or nil. See the section ISO2022 of
2857 `coding.h' for more information.
2859 If `coding->type' is `coding_type_big5', element[4] is t to denote
2860 BIG5-ETen or nil to denote BIG5-HKU.
2862 If `coding->type' takes the other value, element[4] is ignored.
2864 Emacs Lisp's coding system also carries information about format of
2865 end-of-line in a value of property `eol-type'. If the value is
2866 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2867 means CODING_EOL_CR. If it is not integer, it should be a vector
2868 of subsidiary coding systems of which property `eol-type' has one
2873 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2874 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2875 is setup so that no conversion is necessary and return -1, else
2879 setup_coding_system (coding_system
, coding
)
2880 Lisp_Object coding_system
;
2881 struct coding_system
*coding
;
2883 Lisp_Object coding_spec
, coding_type
, eol_type
, plist
;
2887 /* Initialize some fields required for all kinds of coding systems. */
2888 coding
->symbol
= coding_system
;
2889 coding
->common_flags
= 0;
2891 coding
->heading_ascii
= -1;
2892 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2893 coding
->composing
= COMPOSITION_DISABLED
;
2894 coding
->cmp_data
= NULL
;
2896 if (NILP (coding_system
))
2897 goto label_invalid_coding_system
;
2899 coding_spec
= Fget (coding_system
, Qcoding_system
);
2901 if (!VECTORP (coding_spec
)
2902 || XVECTOR (coding_spec
)->size
!= 5
2903 || !CONSP (XVECTOR (coding_spec
)->contents
[3]))
2904 goto label_invalid_coding_system
;
2906 eol_type
= inhibit_eol_conversion
? Qnil
: Fget (coding_system
, Qeol_type
);
2907 if (VECTORP (eol_type
))
2909 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2910 coding
->common_flags
= CODING_REQUIRE_DETECTION_MASK
;
2912 else if (XFASTINT (eol_type
) == 1)
2914 coding
->eol_type
= CODING_EOL_CRLF
;
2915 coding
->common_flags
2916 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2918 else if (XFASTINT (eol_type
) == 2)
2920 coding
->eol_type
= CODING_EOL_CR
;
2921 coding
->common_flags
2922 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2925 coding
->eol_type
= CODING_EOL_LF
;
2927 coding_type
= XVECTOR (coding_spec
)->contents
[0];
2928 /* Try short cut. */
2929 if (SYMBOLP (coding_type
))
2931 if (EQ (coding_type
, Qt
))
2933 coding
->type
= coding_type_undecided
;
2934 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
2937 coding
->type
= coding_type_no_conversion
;
2941 /* Get values of coding system properties:
2942 `post-read-conversion', `pre-write-conversion',
2943 `translation-table-for-decode', `translation-table-for-encode'. */
2944 plist
= XVECTOR (coding_spec
)->contents
[3];
/* Pre & post conversion functions should be disabled if
   inhibit_pre_post_conversion is nonzero.  This is the case that a code
   conversion function is called while those functions are running.  */
2948 if (! inhibit_pre_post_conversion
)
2950 coding
->post_read_conversion
= Fplist_get (plist
, Qpost_read_conversion
);
2951 coding
->pre_write_conversion
= Fplist_get (plist
, Qpre_write_conversion
);
2953 val
= Fplist_get (plist
, Qtranslation_table_for_decode
);
2955 val
= Fget (val
, Qtranslation_table_for_decode
);
2956 coding
->translation_table_for_decode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2957 val
= Fplist_get (plist
, Qtranslation_table_for_encode
);
2959 val
= Fget (val
, Qtranslation_table_for_encode
);
2960 coding
->translation_table_for_encode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2961 val
= Fplist_get (plist
, Qcoding_category
);
2964 val
= Fget (val
, Qcoding_category_index
);
2966 coding
->category_idx
= XINT (val
);
2968 goto label_invalid_coding_system
;
2971 goto label_invalid_coding_system
;
2973 val
= Fplist_get (plist
, Qsafe_charsets
);
2976 for (i
= 0; i
<= MAX_CHARSET
; i
++)
2977 coding
->safe_charsets
[i
] = 1;
2981 bzero (coding
->safe_charsets
, MAX_CHARSET
+ 1);
2984 if ((i
= get_charset_id (XCAR (val
))) >= 0)
2985 coding
->safe_charsets
[i
] = 1;
2990 /* If the coding system has non-nil `composition' property, enable
2991 composition handling. */
2992 val
= Fplist_get (plist
, Qcomposition
);
2994 coding
->composing
= COMPOSITION_NO
;
2996 switch (XFASTINT (coding_type
))
2999 coding
->type
= coding_type_emacs_mule
;
3000 if (!NILP (coding
->post_read_conversion
))
3001 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
3002 if (!NILP (coding
->pre_write_conversion
))
3003 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
3007 coding
->type
= coding_type_sjis
;
3008 coding
->common_flags
3009 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3013 coding
->type
= coding_type_iso2022
;
3014 coding
->common_flags
3015 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3017 Lisp_Object val
, temp
;
3019 int i
, charset
, reg_bits
= 0;
3021 val
= XVECTOR (coding_spec
)->contents
[4];
3023 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
3024 goto label_invalid_coding_system
;
3026 flags
= XVECTOR (val
)->contents
;
3028 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
3029 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
3030 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
3031 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
3032 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
3033 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
3034 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
3035 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
3036 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
3037 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
3038 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3039 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
3040 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
3043 /* Invoke graphic register 0 to plane 0. */
3044 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
3045 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3046 CODING_SPEC_ISO_INVOCATION (coding
, 1)
3047 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
3048 /* Not single shifting at first. */
3049 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
3050 /* Beginning of buffer should also be regarded as bol. */
3051 CODING_SPEC_ISO_BOL (coding
) = 1;
3053 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3054 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = 255;
3055 val
= Vcharset_revision_alist
;
3058 charset
= get_charset_id (Fcar_safe (XCAR (val
)));
3060 && (temp
= Fcdr_safe (XCAR (val
)), INTEGERP (temp
))
3061 && (i
= XINT (temp
), (i
>= 0 && (i
+ '@') < 128)))
3062 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = i
;
/* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
   FLAGS[REG] can be one of below:
	integer CHARSET: CHARSET occupies register REG,
3069 t: designate nothing to REG initially, but can be used
3071 list of integer, nil, or t: designate the first
3072 element (if integer) to REG initially, the remaining
3073 elements (if integer) is designated to REG on request,
3074 if an element is t, REG can be used by any charsets,
3075 nil: REG is never used. */
3076 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3077 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3078 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
3079 for (i
= 0; i
< 4; i
++)
3081 if (INTEGERP (flags
[i
])
3082 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
3083 || (charset
= get_charset_id (flags
[i
])) >= 0)
3085 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3086 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
3088 else if (EQ (flags
[i
], Qt
))
3090 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3092 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3094 else if (CONSP (flags
[i
]))
3099 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3100 if (INTEGERP (XCAR (tail
))
3101 && (charset
= XINT (XCAR (tail
)),
3102 CHARSET_VALID_P (charset
))
3103 || (charset
= get_charset_id (XCAR (tail
))) >= 0)
3105 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3106 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
3109 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3111 while (CONSP (tail
))
3113 if (INTEGERP (XCAR (tail
))
3114 && (charset
= XINT (XCAR (tail
)),
3115 CHARSET_VALID_P (charset
))
3116 || (charset
= get_charset_id (XCAR (tail
))) >= 0)
3117 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3119 else if (EQ (XCAR (tail
), Qt
))
3125 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3127 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
3128 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
3131 if (reg_bits
&& ! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
3133 /* REG 1 can be used only by locking shift in 7-bit env. */
3134 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
3136 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
3137 /* Without any shifting, only REG 0 and 1 can be used. */
3142 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3144 if (CHARSET_VALID_P (charset
))
3146 /* There exist some default graphic registers to be
3149 /* We had better avoid designating a charset of
3150 CHARS96 to REG 0 as far as possible. */
3151 if (CHARSET_CHARS (charset
) == 96)
3152 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3154 ? 1 : (reg_bits
& 4 ? 2 : (reg_bits
& 8 ? 3 : 0)));
3156 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3158 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
3162 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3163 coding
->spec
.iso2022
.last_invalid_designation_register
= -1;
3167 coding
->type
= coding_type_big5
;
3168 coding
->common_flags
3169 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3171 = (NILP (XVECTOR (coding_spec
)->contents
[4])
3172 ? CODING_FLAG_BIG5_HKU
3173 : CODING_FLAG_BIG5_ETEN
);
3177 coding
->type
= coding_type_ccl
;
3178 coding
->common_flags
3179 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3181 val
= XVECTOR (coding_spec
)->contents
[4];
3183 || setup_ccl_program (&(coding
->spec
.ccl
.decoder
),
3185 || setup_ccl_program (&(coding
->spec
.ccl
.encoder
),
3187 goto label_invalid_coding_system
;
3189 bzero (coding
->spec
.ccl
.valid_codes
, 256);
3190 val
= Fplist_get (plist
, Qvalid_codes
);
3195 for (; CONSP (val
); val
= XCDR (val
))
3199 && XINT (this) >= 0 && XINT (this) < 256)
3200 coding
->spec
.ccl
.valid_codes
[XINT (this)] = 1;
3201 else if (CONSP (this)
3202 && INTEGERP (XCAR (this))
3203 && INTEGERP (XCDR (this)))
3205 int start
= XINT (XCAR (this));
3206 int end
= XINT (XCDR (this));
3208 if (start
>= 0 && start
<= end
&& end
< 256)
3209 while (start
<= end
)
3210 coding
->spec
.ccl
.valid_codes
[start
++] = 1;
3215 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3216 coding
->spec
.ccl
.cr_carryover
= 0;
3220 coding
->type
= coding_type_raw_text
;
3224 goto label_invalid_coding_system
;
3228 label_invalid_coding_system
:
3229 coding
->type
= coding_type_no_conversion
;
3230 coding
->category_idx
= CODING_CATEGORY_IDX_BINARY
;
3231 coding
->common_flags
= 0;
3232 coding
->eol_type
= CODING_EOL_LF
;
3233 coding
->pre_write_conversion
= coding
->post_read_conversion
= Qnil
;
3237 /* Free memory blocks allocated for storing composition information. */
3240 coding_free_composition_data (coding
)
3241 struct coding_system
*coding
;
3243 struct composition_data
*cmp_data
= coding
->cmp_data
, *next
;
3247 /* Memory blocks are chained. At first, rewind to the first, then,
3248 free blocks one by one. */
3249 while (cmp_data
->prev
)
3250 cmp_data
= cmp_data
->prev
;
3253 next
= cmp_data
->next
;
3257 coding
->cmp_data
= NULL
;
3260 /* Set `char_offset' member of all memory blocks pointed by
3261 coding->cmp_data to POS. */
3264 coding_adjust_composition_offset (coding
, pos
)
3265 struct coding_system
*coding
;
3268 struct composition_data
*cmp_data
;
3270 for (cmp_data
= coding
->cmp_data
; cmp_data
; cmp_data
= cmp_data
->next
)
3271 cmp_data
->char_offset
= pos
;
3274 /* Setup raw-text or one of its subsidiaries in the structure
3275 coding_system CODING according to the already setup value eol_type
3276 in CODING. CODING should be setup for some coding system in
3280 setup_raw_text_coding_system (coding
)
3281 struct coding_system
*coding
;
3283 if (coding
->type
!= coding_type_raw_text
)
3285 coding
->symbol
= Qraw_text
;
3286 coding
->type
= coding_type_raw_text
;
3287 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3289 Lisp_Object subsidiaries
;
3290 subsidiaries
= Fget (Qraw_text
, Qeol_type
);
3292 if (VECTORP (subsidiaries
)
3293 && XVECTOR (subsidiaries
)->size
== 3)
3295 = XVECTOR (subsidiaries
)->contents
[coding
->eol_type
];
3297 setup_coding_system (coding
->symbol
, coding
);
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
3308 o coding-category-emacs-mule
3310 The category for a coding system which has the same code range
3311 as Emacs' internal format. Assigned the coding-system (Lisp
3312 symbol) `emacs-mule' by default.
3314 o coding-category-sjis
3316 The category for a coding system which has the same code range
3317 as SJIS. Assigned the coding-system (Lisp
3318 symbol) `japanese-shift-jis' by default.
3320 o coding-category-iso-7
3322 The category for a coding system which has the same code range
3323 as ISO2022 of 7-bit environment. This doesn't use any locking
3324 shift and single shift functions. This can encode/decode all
3325 charsets. Assigned the coding-system (Lisp symbol)
3326 `iso-2022-7bit' by default.
3328 o coding-category-iso-7-tight
3330 Same as coding-category-iso-7 except that this can
3331 encode/decode only the specified charsets.
3333 o coding-category-iso-8-1
3335 The category for a coding system which has the same code range
3336 as ISO2022 of 8-bit environment and graphic plane 1 used only
3337 for DIMENSION1 charset. This doesn't use any locking shift
3338 and single shift functions. Assigned the coding-system (Lisp
3339 symbol) `iso-latin-1' by default.
3341 o coding-category-iso-8-2
3343 The category for a coding system which has the same code range
3344 as ISO2022 of 8-bit environment and graphic plane 1 used only
3345 for DIMENSION2 charset. This doesn't use any locking shift
3346 and single shift functions. Assigned the coding-system (Lisp
3347 symbol) `japanese-iso-8bit' by default.
3349 o coding-category-iso-7-else
	The category for a coding system which has the same code range
	as ISO2022 of 7-bit environment but uses locking shift or
	single shift functions.  Assigned the coding-system (Lisp
	symbol) `iso-2022-7bit-lock' by default.
3356 o coding-category-iso-8-else
	The category for a coding system which has the same code range
	as ISO2022 of 8-bit environment but uses locking shift or
	single shift functions.  Assigned the coding-system (Lisp
	symbol) `iso-2022-8bit-ss2' by default.
3363 o coding-category-big5
3365 The category for a coding system which has the same code range
3366 as BIG5. Assigned the coding-system (Lisp symbol)
3367 `cn-big5' by default.
3369 o coding-category-utf-8
3371 The category for a coding system which has the same code range
3372 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3373 symbol) `utf-8' by default.
3375 o coding-category-utf-16-be
3377 The category for a coding system in which a text has an
3378 Unicode signature (cf. Unicode Standard) in the order of BIG
3379 endian at the head. Assigned the coding-system (Lisp symbol)
3380 `utf-16-be' by default.
3382 o coding-category-utf-16-le
3384 The category for a coding system in which a text has a
3385 Unicode signature (cf. Unicode Standard) in the order of
3386 LITTLE endian at the head. Assigned the coding-system (Lisp
3387 symbol) `utf-16-le' by default.
3389 o coding-category-ccl
3391 The category for a coding system whose encoder/decoder is
3392 written in CCL programs. The default value is nil, i.e., no
3393 coding system is assigned.
3395 o coding-category-binary
3397 The category for a coding system not categorized in any of the
3398 above. Assigned the coding-system (Lisp symbol)
3399 `no-conversion' by default.
3401 Each of them is a Lisp symbol and the value is an actual
3402 `coding-system' (this is also a Lisp symbol) assigned by a user.
3403 What Emacs does actually is to detect a category of coding system.
3404 Then, it uses a `coding-system' assigned to it. If Emacs can't
3405 decide on a single possible category, it selects the category of the
3406 highest priority. Priorities of categories are also specified by a
3407 user in a Lisp variable `coding-category-list'.
3412 int ascii_skip_code
[256];
3414 /* Detect how a text of length SRC_BYTES pointed to by SOURCE is encoded.
3415 If it detects possible coding systems, return an integer in which
3416 appropriate flag bits are set. Flag bits are defined by macros
3417 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3418 it should point to the table `coding_priorities'. In that case, only
3419 the flag bit for a coding system of the highest priority is set in
3422 How many ASCII characters are at the head is returned as *SKIP. */
3425 detect_coding_mask (source
, src_bytes
, priorities
, skip
)
3426 unsigned char *source
;
3427 int src_bytes
, *priorities
, *skip
;
3429 register unsigned char c
;
3430 unsigned char *src
= source
, *src_end
= source
+ src_bytes
;
3431 unsigned int mask
, utf16_examined_p
, iso2022_examined_p
;
3434 /* At first, skip all ASCII characters and control characters except
3435 for three ISO2022 specific control characters. */
3436 ascii_skip_code
[ISO_CODE_SO
] = 0;
3437 ascii_skip_code
[ISO_CODE_SI
] = 0;
3438 ascii_skip_code
[ISO_CODE_ESC
] = 0;
3440 label_loop_detect_coding
:
3441 while (src
< src_end
&& ascii_skip_code
[*src
]) src
++;
3442 *skip
= src
- source
;
3445 /* We found nothing other than ASCII. There's nothing to do. */
3449 /* The text seems to be encoded in some multilingual coding system.
3450 Now, try to find in which coding system the text is encoded. */
3453 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3454 /* C is an ISO2022 specific control code of C0. */
3455 mask
= detect_coding_iso2022 (src
, src_end
);
3458 /* No valid ISO2022 code follows C. Try again. */
3460 if (c
== ISO_CODE_ESC
)
3461 ascii_skip_code
[ISO_CODE_ESC
] = 1;
3463 ascii_skip_code
[ISO_CODE_SO
] = ascii_skip_code
[ISO_CODE_SI
] = 1;
3464 goto label_loop_detect_coding
;
3468 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3470 if (mask
& priorities
[i
])
3471 return priorities
[i
];
3473 return CODING_CATEGORY_MASK_RAW_TEXT
;
3482 /* C is the first byte of SJIS character code,
3483 or a leading-code of Emacs' internal format (emacs-mule),
3484 or the first byte of UTF-16. */
3485 try = (CODING_CATEGORY_MASK_SJIS
3486 | CODING_CATEGORY_MASK_EMACS_MULE
3487 | CODING_CATEGORY_MASK_UTF_16_BE
3488 | CODING_CATEGORY_MASK_UTF_16_LE
);
3490 /* Or, if C is a special latin extra code,
3491 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3492 or is an ISO2022 control-sequence-introducer (CSI),
3493 we should also consider the possibility of ISO2022 codings. */
3494 if ((VECTORP (Vlatin_extra_code_table
)
3495 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3496 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
3497 || (c
== ISO_CODE_CSI
3500 || ((*src
== '0' || *src
== '1' || *src
== '2')
3501 && src
+ 1 < src_end
3502 && src
[1] == ']')))))
3503 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3504 | CODING_CATEGORY_MASK_ISO_8BIT
);
3507 /* C is a character of ISO2022 in graphic plane right,
3508 or a SJIS's 1-byte character code (i.e. JISX0201),
3509 or the first byte of BIG5's 2-byte code,
3510 or the first byte of UTF-8/16. */
3511 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3512 | CODING_CATEGORY_MASK_ISO_8BIT
3513 | CODING_CATEGORY_MASK_SJIS
3514 | CODING_CATEGORY_MASK_BIG5
3515 | CODING_CATEGORY_MASK_UTF_8
3516 | CODING_CATEGORY_MASK_UTF_16_BE
3517 | CODING_CATEGORY_MASK_UTF_16_LE
);
3519 /* Or, we may have to consider the possibility of CCL. */
3520 if (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3521 && (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3522 ->spec
.ccl
.valid_codes
)[c
])
3523 try |= CODING_CATEGORY_MASK_CCL
;
3526 utf16_examined_p
= iso2022_examined_p
= 0;
3529 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3531 if (!iso2022_examined_p
3532 && (priorities
[i
] & try & CODING_CATEGORY_MASK_ISO
))
3534 mask
|= detect_coding_iso2022 (src
, src_end
);
3535 iso2022_examined_p
= 1;
3537 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_SJIS
)
3538 mask
|= detect_coding_sjis (src
, src_end
);
3539 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_UTF_8
)
3540 mask
|= detect_coding_utf_8 (src
, src_end
);
3541 else if (!utf16_examined_p
3542 && (priorities
[i
] & try &
3543 CODING_CATEGORY_MASK_UTF_16_BE_LE
))
3545 mask
|= detect_coding_utf_16 (src
, src_end
);
3546 utf16_examined_p
= 1;
3548 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_BIG5
)
3549 mask
|= detect_coding_big5 (src
, src_end
);
3550 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_EMACS_MULE
)
3551 mask
|= detect_coding_emacs_mule (src
, src_end
);
3552 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_CCL
)
3553 mask
|= detect_coding_ccl (src
, src_end
);
3554 else if (priorities
[i
] & CODING_CATEGORY_MASK_RAW_TEXT
)
3555 mask
|= CODING_CATEGORY_MASK_RAW_TEXT
;
3556 else if (priorities
[i
] & CODING_CATEGORY_MASK_BINARY
)
3557 mask
|= CODING_CATEGORY_MASK_BINARY
;
3558 if (mask
& priorities
[i
])
3559 return priorities
[i
];
3561 return CODING_CATEGORY_MASK_RAW_TEXT
;
3563 if (try & CODING_CATEGORY_MASK_ISO
)
3564 mask
|= detect_coding_iso2022 (src
, src_end
);
3565 if (try & CODING_CATEGORY_MASK_SJIS
)
3566 mask
|= detect_coding_sjis (src
, src_end
);
3567 if (try & CODING_CATEGORY_MASK_BIG5
)
3568 mask
|= detect_coding_big5 (src
, src_end
);
3569 if (try & CODING_CATEGORY_MASK_UTF_8
)
3570 mask
|= detect_coding_utf_8 (src
, src_end
);
3571 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE
)
3572 mask
|= detect_coding_utf_16 (src
, src_end
);
3573 if (try & CODING_CATEGORY_MASK_EMACS_MULE
)
3574 mask
|= detect_coding_emacs_mule (src
, src_end
);
3575 if (try & CODING_CATEGORY_MASK_CCL
)
3576 mask
|= detect_coding_ccl (src
, src_end
);
3578 return (mask
| CODING_CATEGORY_MASK_RAW_TEXT
| CODING_CATEGORY_MASK_BINARY
);
3581 /* Detect how a text of length SRC_BYTES pointed to by SRC is encoded.
3582 The information of the detected coding system is set in CODING. */
3585 detect_coding (coding
, src
, src_bytes
)
3586 struct coding_system
*coding
;
3594 val
= Vcoding_category_list
;
3595 mask
= detect_coding_mask (src
, src_bytes
, coding_priorities
, &skip
);
3596 coding
->heading_ascii
= skip
;
3600 /* We found a single coding system of the highest priority in MASK. */
3602 while (mask
&& ! (mask
& 1)) mask
>>= 1, idx
++;
3604 idx
= CODING_CATEGORY_IDX_RAW_TEXT
;
3606 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[idx
])->value
;
3608 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3612 tmp
= Fget (val
, Qeol_type
);
3614 val
= XVECTOR (tmp
)->contents
[coding
->eol_type
];
3617 /* Setup this new coding system while preserving some slots. */
3619 int src_multibyte
= coding
->src_multibyte
;
3620 int dst_multibyte
= coding
->dst_multibyte
;
3622 setup_coding_system (val
, coding
);
3623 coding
->src_multibyte
= src_multibyte
;
3624 coding
->dst_multibyte
= dst_multibyte
;
3625 coding
->heading_ascii
= skip
;
3629 /* Detect how end-of-line of a text of length SRC_BYTES pointed to by
3630 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3631 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3633 How many non-eol characters are at the head is returned as *SKIP. */
3635 #define MAX_EOL_CHECK_COUNT 3
3638 detect_eol_type (source
, src_bytes
, skip
)
3639 unsigned char *source
;
3640 int src_bytes
, *skip
;
3642 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3644 int total
= 0; /* How many end-of-lines are found so far. */
3645 int eol_type
= CODING_EOL_UNDECIDED
;
3650 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3653 if (c
== '\n' || c
== '\r')
3656 *skip
= src
- 1 - source
;
3659 this_eol_type
= CODING_EOL_LF
;
3660 else if (src
>= src_end
|| *src
!= '\n')
3661 this_eol_type
= CODING_EOL_CR
;
3663 this_eol_type
= CODING_EOL_CRLF
, src
++;
3665 if (eol_type
== CODING_EOL_UNDECIDED
)
3666 /* This is the first end-of-line. */
3667 eol_type
= this_eol_type
;
3668 else if (eol_type
!= this_eol_type
)
3670 /* The found type is different from what was found before. */
3671 eol_type
= CODING_EOL_INCONSISTENT
;
3678 *skip
= src_end
- source
;
3682 /* Like detect_eol_type, but detect EOL type in 2-octet
3683 big-endian/little-endian format for coding systems utf-16-be and
3687 detect_eol_type_in_2_octet_form (source
, src_bytes
, skip
, big_endian_p
)
3688 unsigned char *source
;
3689 int src_bytes
, *skip
;
3691 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3692 unsigned int c1
, c2
;
3693 int total
= 0; /* How many end-of-lines are found so far. */
3694 int eol_type
= CODING_EOL_UNDECIDED
;
3705 while ((src
+ 1) < src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3707 c1
= (src
[msb
] << 8) | (src
[lsb
]);
3710 if (c1
== '\n' || c1
== '\r')
3713 *skip
= src
- 2 - source
;
3717 this_eol_type
= CODING_EOL_LF
;
3721 if ((src
+ 1) >= src_end
)
3723 this_eol_type
= CODING_EOL_CR
;
3727 c2
= (src
[msb
] << 8) | (src
[lsb
]);
3729 this_eol_type
= CODING_EOL_CRLF
, src
+= 2;
3731 this_eol_type
= CODING_EOL_CR
;
3735 if (eol_type
== CODING_EOL_UNDECIDED
)
3736 /* This is the first end-of-line. */
3737 eol_type
= this_eol_type
;
3738 else if (eol_type
!= this_eol_type
)
3740 /* The found type is different from what was found before. */
3741 eol_type
= CODING_EOL_INCONSISTENT
;
3748 *skip
= src_end
- source
;
3752 /* Detect how end-of-line of a text of length SRC_BYTES pointed to by SRC
3753 is encoded. If it detects an appropriate format of end-of-line, it
3754 sets the information in *CODING. */
3757 detect_eol (coding
, src
, src_bytes
)
3758 struct coding_system
*coding
;
3766 switch (coding
->category_idx
)
3768 case CODING_CATEGORY_IDX_UTF_16_BE
:
3769 eol_type
= detect_eol_type_in_2_octet_form (src
, src_bytes
, &skip
, 1);
3771 case CODING_CATEGORY_IDX_UTF_16_LE
:
3772 eol_type
= detect_eol_type_in_2_octet_form (src
, src_bytes
, &skip
, 0);
3775 eol_type
= detect_eol_type (src
, src_bytes
, &skip
);
3779 if (coding
->heading_ascii
> skip
)
3780 coding
->heading_ascii
= skip
;
3782 skip
= coding
->heading_ascii
;
3784 if (eol_type
== CODING_EOL_UNDECIDED
)
3786 if (eol_type
== CODING_EOL_INCONSISTENT
)
3789 /* This code is suppressed until we find a better way to
3790 distinguish raw text file and binary file. */
3792 /* If we have already detected that the coding is raw-text, the
3793 coding should actually be no-conversion. */
3794 if (coding
->type
== coding_type_raw_text
)
3796 setup_coding_system (Qno_conversion
, coding
);
3799 /* Else, let's decode only text code anyway. */
3801 eol_type
= CODING_EOL_LF
;
3804 val
= Fget (coding
->symbol
, Qeol_type
);
3805 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
3807 int src_multibyte
= coding
->src_multibyte
;
3808 int dst_multibyte
= coding
->dst_multibyte
;
3810 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
3811 coding
->src_multibyte
= src_multibyte
;
3812 coding
->dst_multibyte
= dst_multibyte
;
3813 coding
->heading_ascii
= skip
;
3817 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3819 #define DECODING_BUFFER_MAG(coding) \
3820 (coding->type == coding_type_iso2022 \
3822 : (coding->type == coding_type_ccl \
3823 ? coding->spec.ccl.decoder.buf_magnification \
3826 /* Return maximum size (bytes) of a buffer enough for decoding
3827 SRC_BYTES of text encoded in CODING. */
3830 decoding_buffer_size (coding
, src_bytes
)
3831 struct coding_system
*coding
;
3834 return (src_bytes
* DECODING_BUFFER_MAG (coding
)
3835 + CONVERSION_BUFFER_EXTRA_ROOM
);
3838 /* Return maximum size (bytes) of a buffer enough for encoding
3839 SRC_BYTES of text to CODING. */
3842 encoding_buffer_size (coding
, src_bytes
)
3843 struct coding_system
*coding
;
3848 if (coding
->type
== coding_type_ccl
)
3849 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3850 else if (CODING_REQUIRE_ENCODING (coding
))
3855 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
3858 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3859 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3862 char *conversion_buffer
;
3863 int conversion_buffer_size
;
3865 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3866 or decoding. Sufficient memory is allocated automatically. If we
3867 run out of memory, return NULL. */
3870 get_conversion_buffer (size
)
3873 if (size
> conversion_buffer_size
)
3876 int real_size
= conversion_buffer_size
* 2;
3878 while (real_size
< size
) real_size
*= 2;
3879 buf
= (char *) xmalloc (real_size
);
3880 xfree (conversion_buffer
);
3881 conversion_buffer
= buf
;
3882 conversion_buffer_size
= real_size
;
3884 return conversion_buffer
;
3888 ccl_coding_driver (coding
, source
, destination
, src_bytes
, dst_bytes
, encodep
)
3889 struct coding_system
*coding
;
3890 unsigned char *source
, *destination
;
3891 int src_bytes
, dst_bytes
, encodep
;
3893 struct ccl_program
*ccl
3894 = encodep
? &coding
->spec
.ccl
.encoder
: &coding
->spec
.ccl
.decoder
;
3897 ccl
->last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
3899 ccl
->eol_type
= coding
->eol_type
;
3900 coding
->produced
= ccl_driver (ccl
, source
, destination
,
3901 src_bytes
, dst_bytes
, &(coding
->consumed
));
3903 coding
->produced_char
= coding
->produced
;
3907 = dst_bytes
? dst_bytes
: source
+ coding
->consumed
- destination
;
3908 coding
->produced
= str_as_multibyte (destination
, bytes
,
3910 &(coding
->produced_char
));
3913 switch (ccl
->status
)
3915 case CCL_STAT_SUSPEND_BY_SRC
:
3916 result
= CODING_FINISH_INSUFFICIENT_SRC
;
3918 case CCL_STAT_SUSPEND_BY_DST
:
3919 result
= CODING_FINISH_INSUFFICIENT_DST
;
3922 case CCL_STAT_INVALID_CMD
:
3923 result
= CODING_FINISH_INTERRUPT
;
3926 result
= CODING_FINISH_NORMAL
;
3932 /* Decode EOL format of the text at PTR of BYTES length destructively
3933 according to CODING->eol_type. This is called after the CCL
3934 program produced a decoded text at PTR. If we do CRLF->LF
3935 conversion, update CODING->produced and CODING->produced_char. */
3938 decode_eol_post_ccl (coding
, ptr
, bytes
)
3939 struct coding_system
*coding
;
3943 Lisp_Object val
, saved_coding_symbol
;
3944 unsigned char *pend
= ptr
+ bytes
;
3947 /* Remember the current coding system symbol. We set it back when
3948 an inconsistent EOL is found so that `last-coding-system-used' is
3949 set to the coding system that doesn't specify EOL conversion. */
3950 saved_coding_symbol
= coding
->symbol
;
3952 coding
->spec
.ccl
.cr_carryover
= 0;
3953 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3955 /* Here, to avoid the call of setup_coding_system, we directly
3956 call detect_eol_type. */
3957 coding
->eol_type
= detect_eol_type (ptr
, bytes
, &dummy
);
3958 if (coding
->eol_type
== CODING_EOL_INCONSISTENT
)
3959 coding
->eol_type
= CODING_EOL_LF
;
3960 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3962 val
= Fget (coding
->symbol
, Qeol_type
);
3963 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
3964 coding
->symbol
= XVECTOR (val
)->contents
[coding
->eol_type
];
3966 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
3969 if (coding
->eol_type
== CODING_EOL_LF
3970 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3972 /* We have nothing to do. */
3975 else if (coding
->eol_type
== CODING_EOL_CRLF
)
3977 unsigned char *pstart
= ptr
, *p
= ptr
;
3979 if (! (coding
->mode
& CODING_MODE_LAST_BLOCK
)
3980 && *(pend
- 1) == '\r')
3982 /* If the last character is CR, we can't handle it here
3983 because LF will be in the not-yet-decoded source text.
3984 Record that the CR is not yet processed. */
3985 coding
->spec
.ccl
.cr_carryover
= 1;
3987 coding
->produced_char
--;
3994 if (ptr
+ 1 < pend
&& *(ptr
+ 1) == '\n')
4001 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
4002 goto undo_eol_conversion
;
4006 else if (*ptr
== '\n'
4007 && coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
4008 goto undo_eol_conversion
;
4013 undo_eol_conversion
:
4014 /* We have encountered an inconsistent EOL format at PTR.
4015 Convert all LFs before PTR back to CRLFs. */
4016 for (p
--, ptr
--; p
>= pstart
; p
--)
4019 *ptr
-- = '\n', *ptr
-- = '\r';
4023 /* If carryover is recorded, cancel it because we don't
4024 convert CRLF anymore. */
4025 if (coding
->spec
.ccl
.cr_carryover
)
4027 coding
->spec
.ccl
.cr_carryover
= 0;
4029 coding
->produced_char
++;
4033 coding
->eol_type
= CODING_EOL_LF
;
4034 coding
->symbol
= saved_coding_symbol
;
4038 /* As each two-byte sequence CRLF was converted to LF, (PEND
4039 - P) is the number of deleted characters. */
4040 coding
->produced
-= pend
- p
;
4041 coding
->produced_char
-= pend
- p
;
4044 else /* i.e. coding->eol_type == CODING_EOL_CR */
4046 unsigned char *p
= ptr
;
4048 for (; ptr
< pend
; ptr
++)
4052 else if (*ptr
== '\n'
4053 && coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
4055 for (; p
< ptr
; p
++)
4061 coding
->eol_type
= CODING_EOL_LF
;
4062 coding
->symbol
= saved_coding_symbol
;
4068 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4069 decoding, it may detect coding system and format of end-of-line if
4070 those are not yet decided. The source should be unibyte, the
4071 result is multibyte if CODING->dst_multibyte is nonzero, else
4075 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
4076 struct coding_system
*coding
;
4077 unsigned char *source
, *destination
;
4078 int src_bytes
, dst_bytes
;
4080 if (coding
->type
== coding_type_undecided
)
4081 detect_coding (coding
, source
, src_bytes
);
4083 if (coding
->eol_type
== CODING_EOL_UNDECIDED
4084 && coding
->type
!= coding_type_ccl
)
4085 detect_eol (coding
, source
, src_bytes
);
4087 coding
->produced
= coding
->produced_char
= 0;
4088 coding
->consumed
= coding
->consumed_char
= 0;
4090 coding
->result
= CODING_FINISH_NORMAL
;
4092 switch (coding
->type
)
4094 case coding_type_sjis
:
4095 decode_coding_sjis_big5 (coding
, source
, destination
,
4096 src_bytes
, dst_bytes
, 1);
4099 case coding_type_iso2022
:
4100 decode_coding_iso2022 (coding
, source
, destination
,
4101 src_bytes
, dst_bytes
);
4104 case coding_type_big5
:
4105 decode_coding_sjis_big5 (coding
, source
, destination
,
4106 src_bytes
, dst_bytes
, 0);
4109 case coding_type_emacs_mule
:
4110 decode_coding_emacs_mule (coding
, source
, destination
,
4111 src_bytes
, dst_bytes
);
4114 case coding_type_ccl
:
4115 if (coding
->spec
.ccl
.cr_carryover
)
4117 /* Set the CR which is not processed by the previous call of
4118 decode_eol_post_ccl in DESTINATION. */
4119 *destination
= '\r';
4121 coding
->produced_char
++;
4124 ccl_coding_driver (coding
, source
,
4125 destination
+ coding
->spec
.ccl
.cr_carryover
,
4126 src_bytes
, dst_bytes
, 0);
4127 if (coding
->eol_type
!= CODING_EOL_LF
)
4128 decode_eol_post_ccl (coding
, destination
, coding
->produced
);
4132 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
4135 if (coding
->result
== CODING_FINISH_INSUFFICIENT_SRC
4136 && coding
->consumed
== src_bytes
)
4137 coding
->result
= CODING_FINISH_NORMAL
;
4139 if (coding
->mode
& CODING_MODE_LAST_BLOCK
4140 && coding
->result
== CODING_FINISH_INSUFFICIENT_SRC
)
4142 unsigned char *src
= source
+ coding
->consumed
;
4143 unsigned char *dst
= destination
+ coding
->produced
;
4145 src_bytes
-= coding
->consumed
;
4147 if (COMPOSING_P (coding
))
4148 DECODE_COMPOSITION_END ('1');
4152 dst
+= CHAR_STRING (c
, dst
);
4153 coding
->produced_char
++;
4155 coding
->consumed
= coding
->consumed_char
= src
- source
;
4156 coding
->produced
= dst
- destination
;
4159 if (!coding
->dst_multibyte
)
4161 coding
->produced
= str_as_unibyte (destination
, coding
->produced
);
4162 coding
->produced_char
= coding
->produced
;
4165 return coding
->result
;
4168 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4169 multibyteness of the source is CODING->src_multibyte, the
4170 multibyteness of the result is always unibyte. */
4173 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
4174 struct coding_system
*coding
;
4175 unsigned char *source
, *destination
;
4176 int src_bytes
, dst_bytes
;
4178 coding
->produced
= coding
->produced_char
= 0;
4179 coding
->consumed
= coding
->consumed_char
= 0;
4181 coding
->result
= CODING_FINISH_NORMAL
;
4183 switch (coding
->type
)
4185 case coding_type_sjis
:
4186 encode_coding_sjis_big5 (coding
, source
, destination
,
4187 src_bytes
, dst_bytes
, 1);
4190 case coding_type_iso2022
:
4191 encode_coding_iso2022 (coding
, source
, destination
,
4192 src_bytes
, dst_bytes
);
4195 case coding_type_big5
:
4196 encode_coding_sjis_big5 (coding
, source
, destination
,
4197 src_bytes
, dst_bytes
, 0);
4200 case coding_type_emacs_mule
:
4201 encode_coding_emacs_mule (coding
, source
, destination
,
4202 src_bytes
, dst_bytes
);
4205 case coding_type_ccl
:
4206 ccl_coding_driver (coding
, source
, destination
,
4207 src_bytes
, dst_bytes
, 1);
4211 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
4214 if (coding
->result
== CODING_FINISH_INSUFFICIENT_SRC
4215 && coding
->consumed
== src_bytes
)
4216 coding
->result
= CODING_FINISH_NORMAL
;
4218 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
4220 unsigned char *src
= source
+ coding
->consumed
;
4221 unsigned char *src_end
= src
+ src_bytes
;
4222 unsigned char *dst
= destination
+ coding
->produced
;
4224 if (coding
->type
== coding_type_iso2022
)
4225 ENCODE_RESET_PLANE_AND_REGISTER
;
4226 if (COMPOSING_P (coding
))
4227 *dst
++ = ISO_CODE_ESC
, *dst
++ = '1';
4228 if (coding
->consumed
< src_bytes
)
4230 int len
= src_bytes
- coding
->consumed
;
4232 BCOPY_SHORT (source
+ coding
->consumed
, dst
, len
);
4233 if (coding
->src_multibyte
)
4234 len
= str_as_unibyte (dst
, len
);
4236 coding
->consumed
= src_bytes
;
4238 coding
->produced
= coding
->produced_char
= dst
- destination
;
4241 return coding
->result
;
4244 /* Scan text in the region between *BEG and *END (byte positions),
4245 skip characters which we don't have to decode by coding system
4246 CODING at the head and tail, then set *BEG and *END to the region
4247 of the text we actually have to convert. The caller should move
4248 the gap out of the region in advance if the region is from a
4251 If STR is not NULL, *BEG and *END are indices into STR. */
4254 shrink_decoding_region (beg
, end
, coding
, str
)
4256 struct coding_system
*coding
;
4259 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
, c
;
4261 Lisp_Object translation_table
;
4263 if (coding
->type
== coding_type_ccl
4264 || coding
->type
== coding_type_undecided
4265 || coding
->eol_type
!= CODING_EOL_LF
4266 || !NILP (coding
->post_read_conversion
)
4267 || coding
->composing
!= COMPOSITION_DISABLED
)
4269 /* We can't skip any data. */
4272 if (coding
->type
== coding_type_no_conversion
4273 || coding
->type
== coding_type_raw_text
4274 || coding
->type
== coding_type_emacs_mule
)
4276 /* We need no conversion, but don't have to skip any data here.
4277 Decoding routine handles them effectively anyway. */
4281 translation_table
= coding
->translation_table_for_decode
;
4282 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
4283 translation_table
= Vstandard_translation_table_for_decode
;
4284 if (CHAR_TABLE_P (translation_table
))
4287 for (i
= 0; i
< 128; i
++)
4288 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
4291 /* Some ASCII character should be translated. We give up
4296 if (coding
->heading_ascii
>= 0)
4297 /* Detection routine has already found how much we can skip at the
4299 *beg
+= coding
->heading_ascii
;
4303 begp_orig
= begp
= str
+ *beg
;
4304 endp_orig
= endp
= str
+ *end
;
4308 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4309 endp_orig
= endp
= begp
+ *end
- *beg
;
4312 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
4313 || coding
->eol_type
== CODING_EOL_CRLF
);
4315 switch (coding
->type
)
4317 case coding_type_sjis
:
4318 case coding_type_big5
:
4319 /* We can skip all ASCII characters at the head. */
4320 if (coding
->heading_ascii
< 0)
4323 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\r') begp
++;
4325 while (begp
< endp
&& *begp
< 0x80) begp
++;
4327 /* We can skip all ASCII characters at the tail except for the
4328 second byte of SJIS or BIG5 code. */
4330 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\r') endp
--;
4332 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
4333 /* Do not consider LF as ascii if preceded by CR, since that
4334 confuses eol decoding. */
4335 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
4337 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] >= 0x80)
4341 case coding_type_iso2022
:
4342 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4343 /* We can't skip any data. */
4345 if (coding
->heading_ascii
< 0)
4347 /* We can skip all ASCII characters at the head except for a
4348 few control codes. */
4349 while (begp
< endp
&& (c
= *begp
) < 0x80
4350 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
4351 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
4352 && (!eol_conversion
|| c
!= ISO_CODE_LF
))
4355 switch (coding
->category_idx
)
4357 case CODING_CATEGORY_IDX_ISO_8_1
:
4358 case CODING_CATEGORY_IDX_ISO_8_2
:
4359 /* We can skip all ASCII characters at the tail. */
4361 while (begp
< endp
&& (c
= endp
[-1]) < 0x80 && c
!= '\r') endp
--;
4363 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
4364 /* Do not consider LF as ascii if preceded by CR, since that
4365 confuses eol decoding. */
4366 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
4370 case CODING_CATEGORY_IDX_ISO_7
:
4371 case CODING_CATEGORY_IDX_ISO_7_TIGHT
:
4373 /* We can skip all characters at the tail except for 8-bit
4374 codes and ESC and the following 2 bytes at the tail. */
4375 unsigned char *eight_bit
= NULL
;
4379 && (c
= endp
[-1]) != ISO_CODE_ESC
&& c
!= '\r')
4381 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4386 && (c
= endp
[-1]) != ISO_CODE_ESC
)
4388 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4391 /* Do not consider LF as ascii if preceded by CR, since that
4392 confuses eol decoding. */
4393 if (begp
< endp
&& endp
< endp_orig
4394 && endp
[-1] == '\r' && endp
[0] == '\n')
4396 if (begp
< endp
&& endp
[-1] == ISO_CODE_ESC
)
4398 if (endp
+ 1 < endp_orig
&& end
[0] == '(' && end
[1] == 'B')
4399 /* This is an ASCII designation sequence. We can
4400 surely skip the tail. But, if we have
4401 encountered an 8-bit code, skip only the codes
4403 endp
= eight_bit
? eight_bit
: endp
+ 2;
4405 /* Hmmm, we can't skip the tail. */
4417 *beg
+= begp
- begp_orig
;
4418 *end
+= endp
- endp_orig
;
4422 /* Like shrink_decoding_region but for encoding. */
4425 shrink_encoding_region (beg
, end
, coding
, str
)
4427 struct coding_system
*coding
;
4430 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
;
4432 Lisp_Object translation_table
;
4434 if (coding
->type
== coding_type_ccl
4435 || coding
->eol_type
== CODING_EOL_CRLF
4436 || coding
->eol_type
== CODING_EOL_CR
4437 || coding
->cmp_data
&& coding
->cmp_data
->used
> 0)
4439 /* We can't skip any data. */
4442 if (coding
->type
== coding_type_no_conversion
4443 || coding
->type
== coding_type_raw_text
4444 || coding
->type
== coding_type_emacs_mule
4445 || coding
->type
== coding_type_undecided
)
4447 /* We need no conversion, but don't have to skip any data here.
4448 Encoding routine handles them effectively anyway. */
4452 translation_table
= coding
->translation_table_for_encode
;
4453 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
4454 translation_table
= Vstandard_translation_table_for_encode
;
4455 if (CHAR_TABLE_P (translation_table
))
4458 for (i
= 0; i
< 128; i
++)
4459 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
4462 /* Some ASCII character should be translated. We give up
4469 begp_orig
= begp
= str
+ *beg
;
4470 endp_orig
= endp
= str
+ *end
;
4474 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4475 endp_orig
= endp
= begp
+ *end
- *beg
;
4478 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
4479 || coding
->eol_type
== CODING_EOL_CRLF
);
4481 /* Here, we don't have to check coding->pre_write_conversion because
4482 the caller is expected to have handled it already. */
4483 switch (coding
->type
)
4485 case coding_type_iso2022
:
4486 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4487 /* We can't skip any data. */
4489 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
4491 unsigned char *bol
= begp
;
4492 while (begp
< endp
&& *begp
< 0x80)
4495 if (begp
[-1] == '\n')
4499 goto label_skip_tail
;
4503 case coding_type_sjis
:
4504 case coding_type_big5
:
4505 /* We can skip all ASCII characters at the head and tail. */
4507 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\n') begp
++;
4509 while (begp
< endp
&& *begp
< 0x80) begp
++;
4512 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\n') endp
--;
4514 while (begp
< endp
&& *(endp
- 1) < 0x80) endp
--;
4521 *beg
+= begp
- begp_orig
;
4522 *end
+= endp
- endp_orig
;
4526 /* As shrinking conversion region requires some overhead, we don't try
4527 shrinking if the length of conversion region is less than this
4529 static int shrink_conversion_region_threshhold
= 1024;
4531 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
4533 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
4535 if (encodep) shrink_encoding_region (beg, end, coding, str); \
4536 else shrink_decoding_region (beg, end, coding, str); \
4541 code_convert_region_unwind (dummy
)
4544 inhibit_pre_post_conversion
= 0;
4548 /* Store information about all compositions in the range FROM and TO
4549 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4550 buffer or a string, defaults to the current buffer. */
4553 coding_save_composition (coding
, from
, to
, obj
)
4554 struct coding_system
*coding
;
4561 if (coding
->composing
== COMPOSITION_DISABLED
)
4563 if (!coding
->cmp_data
)
4564 coding_allocate_composition_data (coding
, from
);
4565 if (!find_composition (from
, to
, &start
, &end
, &prop
, obj
)
4569 && (!find_composition (end
, to
, &start
, &end
, &prop
, obj
)
4572 coding
->composing
= COMPOSITION_NO
;
4575 if (COMPOSITION_VALID_P (start
, end
, prop
))
4577 enum composition_method method
= COMPOSITION_METHOD (prop
);
4578 if (coding
->cmp_data
->used
+ COMPOSITION_DATA_MAX_BUNCH_LENGTH
4579 >= COMPOSITION_DATA_SIZE
)
4580 coding_allocate_composition_data (coding
, from
);
4581 /* For relative composition, we remember start and end
4582 positions, for the other compositions, we also remember
4584 CODING_ADD_COMPOSITION_START (coding
, start
- from
, method
);
4585 if (method
!= COMPOSITION_RELATIVE
)
4587 /* We must store all components of the composition. */
4588 Lisp_Object val
, ch
;
4590 val
= COMPOSITION_COMPONENTS (prop
);
4594 ch
= XCAR (val
), val
= XCDR (val
);
4595 CODING_ADD_COMPOSITION_COMPONENT (coding
, XINT (ch
));
4597 else if (VECTORP (val
) || STRINGP (val
))
4599 int len
= (VECTORP (val
)
4600 ? XVECTOR (val
)->size
: XSTRING (val
)->size
);
4602 for (i
= 0; i
< len
; i
++)
4605 ? Faref (val
, make_number (i
))
4606 : XVECTOR (val
)->contents
[i
]);
4607 CODING_ADD_COMPOSITION_COMPONENT (coding
, XINT (ch
));
4610 else /* INTEGERP (val) */
4611 CODING_ADD_COMPOSITION_COMPONENT (coding
, XINT (val
));
4613 CODING_ADD_COMPOSITION_END (coding
, end
- from
);
4618 && find_composition (start
, to
, &start
, &end
, &prop
, obj
)
4621 /* Make coding->cmp_data point to the first memory block. */
4622 while (coding
->cmp_data
->prev
)
4623 coding
->cmp_data
= coding
->cmp_data
->prev
;
4624 coding
->cmp_data_start
= 0;
4627 /* Reflect the saved information about compositions to OBJ.
4628 CODING->cmp_data points to a memory block for the information. OBJ
4629 is a buffer or a string, defaults to the current buffer. */
4632 coding_restore_composition (coding
, obj
)
4633 struct coding_system
*coding
;
4636 struct composition_data
*cmp_data
= coding
->cmp_data
;
4641 while (cmp_data
->prev
)
4642 cmp_data
= cmp_data
->prev
;
4648 for (i
= 0; i
< cmp_data
->used
; i
+= cmp_data
->data
[i
])
4650 int *data
= cmp_data
->data
+ i
;
4651 enum composition_method method
= (enum composition_method
) data
[3];
4652 Lisp_Object components
;
4654 if (method
== COMPOSITION_RELATIVE
)
4658 int len
= data
[0] - 4, j
;
4659 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
4661 for (j
= 0; j
< len
; j
++)
4662 args
[j
] = make_number (data
[4 + j
]);
4663 components
= (method
== COMPOSITION_WITH_ALTCHARS
4664 ? Fstring (len
, args
) : Fvector (len
, args
));
4666 compose_text (data
[1], data
[2], components
, Qnil
, obj
);
4668 cmp_data
= cmp_data
->next
;
4672 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4673 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4674 coding system CODING, and return the status code of code conversion
4675 (currently, this value has no meaning).
4677 How many characters (and bytes) are converted to how many
4678 characters (and bytes) are recorded in members of the structure
4681 If REPLACE is nonzero, we do various things as if the original text
4682 is deleted and a new text is inserted. See the comments in
4683 replace_range (insdel.c) to know what we are doing.
4685 If REPLACE is zero, it is assumed that the source text is unibyte.
4686 Otherwise, it is assumed that the source text is multibyte. */
4689 code_convert_region (from
, from_byte
, to
, to_byte
, coding
, encodep
, replace
)
4690 int from
, from_byte
, to
, to_byte
, encodep
, replace
;
4691 struct coding_system
*coding
;
4693 int len
= to
- from
, len_byte
= to_byte
- from_byte
;
4694 int require
, inserted
, inserted_byte
;
4695 int head_skip
, tail_skip
, total_skip
= 0;
4696 Lisp_Object saved_coding_symbol
;
4698 unsigned char *src
, *dst
;
4699 Lisp_Object deletion
;
4700 int orig_point
= PT
, orig_len
= len
;
4702 int multibyte_p
= !NILP (current_buffer
->enable_multibyte_characters
);
4704 coding
->src_multibyte
= replace
&& multibyte_p
;
4705 coding
->dst_multibyte
= multibyte_p
;
4708 saved_coding_symbol
= Qnil
;
4710 if (from
< PT
&& PT
< to
)
4712 TEMP_SET_PT_BOTH (from
, from_byte
);
4718 int saved_from
= from
;
4720 prepare_to_modify_buffer (from
, to
, &from
);
4721 if (saved_from
!= from
)
4724 from_byte
= CHAR_TO_BYTE (from
), to_byte
= CHAR_TO_BYTE (to
);
4725 len_byte
= to_byte
- from_byte
;
4729 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4731 /* We must detect encoding of text and eol format. */
4733 if (from
< GPT
&& to
> GPT
)
4734 move_gap_both (from
, from_byte
);
4735 if (coding
->type
== coding_type_undecided
)
4737 detect_coding (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4738 if (coding
->type
== coding_type_undecided
)
4739 /* It seems that the text contains only ASCII, but we
4740 should not leave it undecided because the deeper
4741 decoding routine (decode_coding) tries to detect the
4742 encodings again in vain. */
4743 coding
->type
= coding_type_emacs_mule
;
4745 if (coding
->eol_type
== CODING_EOL_UNDECIDED
4746 && coding
->type
!= coding_type_ccl
)
4748 saved_coding_symbol
= coding
->symbol
;
4749 detect_eol (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4750 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4751 coding
->eol_type
= CODING_EOL_LF
;
4752 /* We had better recover the original eol format if we
4753 encounter an inconsistent eol format while decoding. */
4754 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4758 /* Now we convert the text. */
4760 /* For encoding, we must process pre-write-conversion in advance. */
4761 if (! inhibit_pre_post_conversion
4763 && SYMBOLP (coding
->pre_write_conversion
)
4764 && ! NILP (Ffboundp (coding
->pre_write_conversion
)))
4766 /* The function in pre-write-conversion may put a new text in a
4768 struct buffer
*prev
= current_buffer
;
4770 int count
= specpdl_ptr
- specpdl
;
4772 record_unwind_protect (code_convert_region_unwind
, Qnil
);
4773 /* We should not call any more pre-write/post-read-conversion
4774 functions while this pre-write-conversion is running. */
4775 inhibit_pre_post_conversion
= 1;
4776 call2 (coding
->pre_write_conversion
,
4777 make_number (from
), make_number (to
));
4778 inhibit_pre_post_conversion
= 0;
4779 /* Discard the unwind protect. */
4782 if (current_buffer
!= prev
)
4785 new = Fcurrent_buffer ();
4786 set_buffer_internal_1 (prev
);
4787 del_range_2 (from
, from_byte
, to
, to_byte
, 0);
4788 TEMP_SET_PT_BOTH (from
, from_byte
);
4789 insert_from_buffer (XBUFFER (new), 1, len
, 0);
4791 if (orig_point
>= to
)
4792 orig_point
+= len
- orig_len
;
4793 else if (orig_point
> from
)
4797 from_byte
= CHAR_TO_BYTE (from
);
4798 to_byte
= CHAR_TO_BYTE (to
);
4799 len_byte
= to_byte
- from_byte
;
4800 TEMP_SET_PT_BOTH (from
, from_byte
);
4805 deletion
= make_buffer_string_both (from
, from_byte
, to
, to_byte
, 1);
4807 if (coding
->composing
!= COMPOSITION_DISABLED
)
4810 coding_save_composition (coding
, from
, to
, Fcurrent_buffer ());
4812 coding_allocate_composition_data (coding
, from
);
4815 /* Try to skip the leading and trailing ASCII characters. */
4816 if (coding
->type
!= coding_type_ccl
)
4818 int from_byte_orig
= from_byte
, to_byte_orig
= to_byte
;
4820 if (from
< GPT
&& GPT
< to
)
4821 move_gap_both (from
, from_byte
);
4822 SHRINK_CONVERSION_REGION (&from_byte
, &to_byte
, coding
, NULL
, encodep
);
4823 if (from_byte
== to_byte
4824 && (encodep
|| NILP (coding
->post_read_conversion
))
4825 && ! CODING_REQUIRE_FLUSHING (coding
))
4827 coding
->produced
= len_byte
;
4828 coding
->produced_char
= len
;
4830 /* We must record and adjust for this new text now. */
4831 adjust_after_insert (from
, from_byte_orig
, to
, to_byte_orig
, len
);
4835 head_skip
= from_byte
- from_byte_orig
;
4836 tail_skip
= to_byte_orig
- to_byte
;
4837 total_skip
= head_skip
+ tail_skip
;
4840 len
-= total_skip
; len_byte
-= total_skip
;
4843 /* The code conversion routine can not preserve text properties for
4844 now. So, we must remove all text properties in the region.
4845 Here, we must suppress all modification hooks. */
4848 int saved_inhibit_modification_hooks
= inhibit_modification_hooks
;
4849 inhibit_modification_hooks
= 1;
4850 Fset_text_properties (make_number (from
), make_number (to
), Qnil
, Qnil
);
4851 inhibit_modification_hooks
= saved_inhibit_modification_hooks
;
4854 /* For conversion, we must put the gap before the text in addition to
4855 making the gap larger for efficient decoding. The required gap
4856 size starts from 2000 which is the magic number used in make_gap.
4857 But, after one batch of conversion, it will be incremented if we
4858 find that it is not enough. */
4861 if (GAP_SIZE
< require
)
4862 make_gap (require
- GAP_SIZE
);
4863 move_gap_both (from
, from_byte
);
4865 inserted
= inserted_byte
= 0;
4867 GAP_SIZE
+= len_byte
;
4870 ZV_BYTE
-= len_byte
;
4873 if (GPT
- BEG
< BEG_UNCHANGED
)
4874 BEG_UNCHANGED
= GPT
- BEG
;
4875 if (Z
- GPT
< END_UNCHANGED
)
4876 END_UNCHANGED
= Z
- GPT
;
4878 if (!encodep
&& coding
->src_multibyte
)
4880 /* Decoding routines expect that the source text is unibyte.
4881 We must convert 8-bit characters of multibyte form to
4883 int len_byte_orig
= len_byte
;
4884 len_byte
= str_as_unibyte (GAP_END_ADDR
- len_byte
, len_byte
);
4885 if (len_byte
< len_byte_orig
)
4886 safe_bcopy (GAP_END_ADDR
- len_byte_orig
, GAP_END_ADDR
- len_byte
,
4888 coding
->src_multibyte
= 0;
4895 /* The buffer memory is now:
4896 +--------+converted-text+---------+-------original-text-------+---+
4897 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4898 |<---------------------- GAP ----------------------->| */
4899 src
= GAP_END_ADDR
- len_byte
;
4900 dst
= GPT_ADDR
+ inserted_byte
;
4903 result
= encode_coding (coding
, src
, dst
, len_byte
, 0);
4905 result
= decode_coding (coding
, src
, dst
, len_byte
, 0);
4907 /* The buffer memory is now:
4908 +--------+-------converted-text----+--+------original-text----+---+
4909 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4910 |<---------------------- GAP ----------------------->| */
4912 inserted
+= coding
->produced_char
;
4913 inserted_byte
+= coding
->produced
;
4914 len_byte
-= coding
->consumed
;
4916 if (result
== CODING_FINISH_INSUFFICIENT_CMP
)
4918 coding_allocate_composition_data (coding
, from
+ inserted
);
4922 src
+= coding
->consumed
;
4923 dst
+= coding
->produced
;
4925 if (result
== CODING_FINISH_NORMAL
)
4930 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4932 unsigned char *pend
= dst
, *p
= pend
- inserted_byte
;
4933 Lisp_Object eol_type
;
4935 /* Encode LFs back to the original eol format (CR or CRLF). */
4936 if (coding
->eol_type
== CODING_EOL_CR
)
4938 while (p
< pend
) if (*p
++ == '\n') p
[-1] = '\r';
4944 while (p
< pend
) if (*p
++ == '\n') count
++;
4945 if (src
- dst
< count
)
4947 /* We don't have sufficient room for encoding LFs
4948 back to CRLF. We must record converted and
4949 not-yet-converted text back to the buffer
4950 content, enlarge the gap, then record them out of
4951 the buffer contents again. */
4952 int add
= len_byte
+ inserted_byte
;
4955 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4956 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4957 make_gap (count
- GAP_SIZE
);
4959 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4960 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4961 /* Don't forget to update SRC, DST, and PEND. */
4962 src
= GAP_END_ADDR
- len_byte
;
4963 dst
= GPT_ADDR
+ inserted_byte
;
4967 inserted_byte
+= count
;
4968 coding
->produced
+= count
;
4969 p
= dst
= pend
+ count
;
4973 if (*p
== '\n') count
--, *--p
= '\r';
4977 /* Suppress eol-format conversion in the further conversion. */
4978 coding
->eol_type
= CODING_EOL_LF
;
4980 /* Set the coding system symbol to that for Unix-like EOL. */
4981 eol_type
= Fget (saved_coding_symbol
, Qeol_type
);
4982 if (VECTORP (eol_type
)
4983 && XVECTOR (eol_type
)->size
== 3
4984 && SYMBOLP (XVECTOR (eol_type
)->contents
[CODING_EOL_LF
]))
4985 coding
->symbol
= XVECTOR (eol_type
)->contents
[CODING_EOL_LF
];
4987 coding
->symbol
= saved_coding_symbol
;
4993 if (coding
->type
!= coding_type_ccl
4994 || coding
->mode
& CODING_MODE_LAST_BLOCK
)
4996 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
4999 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
5001 /* The source text ends in invalid codes. Let's just
5002 make them valid buffer contents, and finish conversion. */
5003 inserted
+= len_byte
;
5004 inserted_byte
+= len_byte
;
5009 if (result
== CODING_FINISH_INTERRUPT
)
5011 /* The conversion procedure was interrupted by a user. */
5014 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5015 if (coding
->consumed
< 1)
5017 /* It's quite strange to require more memory without
5018 consuming any bytes. Perhaps CCL program bug. */
5023 /* We have just done the first batch of conversion which was
5024 stopped because of insufficient gap. Let's reconsider the
5025 required gap size (i.e. SRC - DST) now.
5027 We have converted ORIG bytes (== coding->consumed) into
5028 NEW bytes (coding->produced). To convert the remaining
5029 LEN bytes, we may need REQUIRE bytes of gap, where:
5030 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5031 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5032 Here, we are sure that NEW >= ORIG. */
5033 float ratio
= coding
->produced
- coding
->consumed
;
5034 ratio
/= coding
->consumed
;
5035 require
= len_byte
* ratio
;
5038 if ((src
- dst
) < (require
+ 2000))
5040 /* See the comment above the previous call of make_gap. */
5041 int add
= len_byte
+ inserted_byte
;
5044 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
5045 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
5046 make_gap (require
+ 2000);
5048 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
5049 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
5052 if (src
- dst
> 0) *dst
= 0; /* Put an anchor. */
5054 if (encodep
&& coding
->dst_multibyte
)
5056 /* The output is unibyte. We must convert 8-bit characters to
5058 if (inserted_byte
* 2 > GAP_SIZE
)
5060 GAP_SIZE
-= inserted_byte
;
5061 ZV
+= inserted_byte
; Z
+= inserted_byte
;
5062 ZV_BYTE
+= inserted_byte
; Z_BYTE
+= inserted_byte
;
5063 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
5064 make_gap (inserted_byte
- GAP_SIZE
);
5065 GAP_SIZE
+= inserted_byte
;
5066 ZV
-= inserted_byte
; Z
-= inserted_byte
;
5067 ZV_BYTE
-= inserted_byte
; Z_BYTE
-= inserted_byte
;
5068 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
5070 inserted_byte
= str_to_multibyte (GPT_ADDR
, GAP_SIZE
, inserted_byte
);
5073 /* If we have shrunk the conversion area, adjust it now. */
5077 safe_bcopy (GAP_END_ADDR
, GPT_ADDR
+ inserted_byte
, tail_skip
);
5078 inserted
+= total_skip
; inserted_byte
+= total_skip
;
5079 GAP_SIZE
+= total_skip
;
5080 GPT
-= head_skip
; GPT_BYTE
-= head_skip
;
5081 ZV
-= total_skip
; ZV_BYTE
-= total_skip
;
5082 Z
-= total_skip
; Z_BYTE
-= total_skip
;
5083 from
-= head_skip
; from_byte
-= head_skip
;
5084 to
+= tail_skip
; to_byte
+= tail_skip
;
5088 adjust_after_replace (from
, from_byte
, deletion
, inserted
, inserted_byte
);
5089 inserted
= Z
- prev_Z
;
5091 if (!encodep
&& coding
->cmp_data
&& coding
->cmp_data
->used
)
5092 coding_restore_composition (coding
, Fcurrent_buffer ());
5093 coding_free_composition_data (coding
);
5095 if (! inhibit_pre_post_conversion
5096 && ! encodep
&& ! NILP (coding
->post_read_conversion
))
5099 int count
= specpdl_ptr
- specpdl
;
5102 TEMP_SET_PT_BOTH (from
, from_byte
);
5104 record_unwind_protect (code_convert_region_unwind
, Qnil
);
5105 /* We should not call any more pre-write/post-read-conversion
5106 functions while this post-read-conversion is running. */
5107 inhibit_pre_post_conversion
= 1;
5108 val
= call1 (coding
->post_read_conversion
, make_number (inserted
));
5109 inhibit_pre_post_conversion
= 0;
5110 /* Discard the unwind protect. */
5112 CHECK_NUMBER (val
, 0);
5113 inserted
+= Z
- prev_Z
;
5116 if (orig_point
>= from
)
5118 if (orig_point
>= from
+ orig_len
)
5119 orig_point
+= inserted
- orig_len
;
5122 TEMP_SET_PT (orig_point
);
5127 signal_after_change (from
, to
- from
, inserted
);
5128 update_compositions (from
, from
+ inserted
, CHECK_BORDER
);
5132 coding
->consumed
= to_byte
- from_byte
;
5133 coding
->consumed_char
= to
- from
;
5134 coding
->produced
= inserted_byte
;
5135 coding
->produced_char
= inserted
;
5142 run_pre_post_conversion_on_str (str
, coding
, encodep
)
5144 struct coding_system
*coding
;
5147 int count
= specpdl_ptr
- specpdl
;
5148 struct gcpro gcpro1
;
5149 struct buffer
*prev
= current_buffer
;
5150 int multibyte
= STRING_MULTIBYTE (str
);
5152 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
5153 record_unwind_protect (code_convert_region_unwind
, Qnil
);
5155 temp_output_buffer_setup (" *code-converting-work*");
5156 set_buffer_internal (XBUFFER (Vstandard_output
));
5157 /* We must insert the contents of STR as is without
5158 unibyte<->multibyte conversion. For that, we adjust the
5159 multibyteness of the working buffer to that of STR. */
5161 current_buffer
->enable_multibyte_characters
= multibyte
? Qt
: Qnil
;
5162 insert_from_string (str
, 0, 0,
5163 XSTRING (str
)->size
, STRING_BYTES (XSTRING (str
)), 0);
5165 inhibit_pre_post_conversion
= 1;
5167 call2 (coding
->pre_write_conversion
, make_number (BEG
), make_number (Z
));
5170 TEMP_SET_PT_BOTH (BEG
, BEG_BYTE
);
5171 call1 (coding
->post_read_conversion
, make_number (Z
- BEG
));
5173 inhibit_pre_post_conversion
= 0;
5174 str
= make_buffer_string (BEG
, Z
, 0);
5175 return unbind_to (count
, str
);
5179 decode_coding_string (str
, coding
, nocopy
)
5181 struct coding_system
*coding
;
5186 int from
, to
, to_byte
;
5187 struct gcpro gcpro1
;
5188 Lisp_Object saved_coding_symbol
;
5192 to
= XSTRING (str
)->size
;
5193 to_byte
= STRING_BYTES (XSTRING (str
));
5195 saved_coding_symbol
= Qnil
;
5196 if (CODING_REQUIRE_DETECTION (coding
))
5198 /* See the comments in code_convert_region. */
5199 if (coding
->type
== coding_type_undecided
)
5201 detect_coding (coding
, XSTRING (str
)->data
, to_byte
);
5202 if (coding
->type
== coding_type_undecided
)
5203 coding
->type
= coding_type_emacs_mule
;
5205 if (coding
->eol_type
== CODING_EOL_UNDECIDED
5206 && coding
->type
!= coding_type_ccl
)
5208 saved_coding_symbol
= coding
->symbol
;
5209 detect_eol (coding
, XSTRING (str
)->data
, to_byte
);
5210 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
5211 coding
->eol_type
= CODING_EOL_LF
;
5212 /* We had better recover the original eol format if we
5213 encounter an inconsistent eol format while decoding. */
5214 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
5218 if (! CODING_REQUIRE_DECODING (coding
))
5220 if (!STRING_MULTIBYTE (str
))
5222 str
= Fstring_as_multibyte (str
);
5225 return (nocopy
? str
: Fcopy_sequence (str
));
5228 if (STRING_MULTIBYTE (str
))
5230 /* Decoding routines expect the source text to be unibyte. */
5231 str
= Fstring_as_unibyte (str
);
5232 to_byte
= STRING_BYTES (XSTRING (str
));
5234 coding
->src_multibyte
= 0;
5236 coding
->dst_multibyte
= 1;
5238 if (coding
->composing
!= COMPOSITION_DISABLED
)
5239 coding_allocate_composition_data (coding
, from
);
5241 /* Try to skip the leading and trailing ASCII characters. */
5242 if (coding
->type
!= coding_type_ccl
)
5244 int from_orig
= from
;
5246 SHRINK_CONVERSION_REGION (&from
, &to_byte
, coding
, XSTRING (str
)->data
,
5248 if (from
== to_byte
)
5249 return (nocopy
? str
: Fcopy_sequence (str
));
5252 len
= decoding_buffer_size (coding
, to_byte
- from
);
5253 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
5255 buf
= get_conversion_buffer (len
);
5259 bcopy (XSTRING (str
)->data
, buf
, from
);
5260 result
= decode_coding (coding
, XSTRING (str
)->data
+ from
,
5261 buf
+ from
, to_byte
- from
, len
);
5262 if (result
== CODING_FINISH_INCONSISTENT_EOL
)
5264 /* We simply try to decode the whole string again but without
5265 eol-conversion this time. */
5266 coding
->eol_type
= CODING_EOL_LF
;
5267 coding
->symbol
= saved_coding_symbol
;
5268 coding_free_composition_data (coding
);
5269 return decode_coding_string (str
, coding
, nocopy
);
5272 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
5273 STRING_BYTES (XSTRING (str
)) - to_byte
);
5275 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
5276 str
= make_multibyte_string (buf
, len
+ coding
->produced_char
,
5277 len
+ coding
->produced
);
5279 if (coding
->cmp_data
&& coding
->cmp_data
->used
)
5280 coding_restore_composition (coding
, str
);
5281 coding_free_composition_data (coding
);
5283 if (SYMBOLP (coding
->post_read_conversion
)
5284 && !NILP (Ffboundp (coding
->post_read_conversion
)))
5285 str
= run_pre_post_conversion_on_str (str
, coding
, 0);
5291 encode_coding_string (str
, coding
, nocopy
)
5293 struct coding_system
*coding
;
5298 int from
, to
, to_byte
;
5299 struct gcpro gcpro1
;
5300 Lisp_Object saved_coding_symbol
;
5303 if (SYMBOLP (coding
->pre_write_conversion
)
5304 && !NILP (Ffboundp (coding
->pre_write_conversion
)))
5305 str
= run_pre_post_conversion_on_str (str
, coding
, 1);
5308 to
= XSTRING (str
)->size
;
5309 to_byte
= STRING_BYTES (XSTRING (str
));
5311 saved_coding_symbol
= Qnil
;
5312 if (! CODING_REQUIRE_ENCODING (coding
))
5314 if (STRING_MULTIBYTE (str
))
5316 str
= Fstring_as_unibyte (str
);
5319 return (nocopy
? str
: Fcopy_sequence (str
));
5322 /* Encoding routines determine the multibyteness of the source text
5323 by coding->src_multibyte. */
5324 coding
->src_multibyte
= STRING_MULTIBYTE (str
);
5325 coding
->dst_multibyte
= 0;
5327 if (coding
->composing
!= COMPOSITION_DISABLED
)
5328 coding_save_composition (coding
, from
, to
, str
);
5330 /* Try to skip the leading and trailing ASCII characters. */
5331 if (coding
->type
!= coding_type_ccl
)
5333 int from_orig
= from
;
5335 SHRINK_CONVERSION_REGION (&from
, &to_byte
, coding
, XSTRING (str
)->data
,
5337 if (from
== to_byte
)
5338 return (nocopy
? str
: Fcopy_sequence (str
));
5341 len
= encoding_buffer_size (coding
, to_byte
- from
);
5342 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
5344 buf
= get_conversion_buffer (len
);
5348 bcopy (XSTRING (str
)->data
, buf
, from
);
5349 result
= encode_coding (coding
, XSTRING (str
)->data
+ from
,
5350 buf
+ from
, to_byte
- from
, len
);
5351 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
5352 STRING_BYTES (XSTRING (str
)) - to_byte
);
5354 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
5355 str
= make_unibyte_string (buf
, len
+ coding
->produced
);
5356 coding_free_composition_data (coding
);
5363 /*** 8. Emacs Lisp library functions ***/
5365 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
5366 "Return t if OBJECT is nil or a coding-system.\n\
5367 See the documentation of `make-coding-system' for information\n\
5368 about coding-system objects.")
5376 /* Get coding-spec vector for OBJ. */
5377 obj
= Fget (obj
, Qcoding_system
);
5378 return ((VECTORP (obj
) && XVECTOR (obj
)->size
== 5)
5382 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
5383 Sread_non_nil_coding_system
, 1, 1, 0,
5384 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5391 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
5392 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
5394 while (XSTRING (val
)->size
== 0);
5395 return (Fintern (val
, Qnil
));
5398 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
5399 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5400 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5401 (prompt
, default_coding_system
)
5402 Lisp_Object prompt
, default_coding_system
;
5405 if (SYMBOLP (default_coding_system
))
5406 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
5407 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
5408 Qt
, Qnil
, Qcoding_system_history
,
5409 default_coding_system
, Qnil
);
5410 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
5413 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
5415 "Check validity of CODING-SYSTEM.\n\
5416 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5417 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5418 The value of property should be a vector of length 5.")
5420 Lisp_Object coding_system
;
5422 CHECK_SYMBOL (coding_system
, 0);
5423 if (!NILP (Fcoding_system_p (coding_system
)))
5424 return coding_system
;
5426 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
5430 detect_coding_system (src
, src_bytes
, highest
)
5432 int src_bytes
, highest
;
5434 int coding_mask
, eol_type
;
5435 Lisp_Object val
, tmp
;
5438 coding_mask
= detect_coding_mask (src
, src_bytes
, NULL
, &dummy
);
5439 eol_type
= detect_eol_type (src
, src_bytes
, &dummy
);
5440 if (eol_type
== CODING_EOL_INCONSISTENT
)
5441 eol_type
= CODING_EOL_UNDECIDED
;
5446 if (eol_type
!= CODING_EOL_UNDECIDED
)
5449 val2
= Fget (Qundecided
, Qeol_type
);
5451 val
= XVECTOR (val2
)->contents
[eol_type
];
5453 return (highest
? val
: Fcons (val
, Qnil
));
5456 /* At first, gather possible coding systems in VAL. */
5458 for (tmp
= Vcoding_category_list
; CONSP (tmp
); tmp
= XCDR (tmp
))
5460 Lisp_Object category_val
, category_index
;
5462 category_index
= Fget (XCAR (tmp
), Qcoding_category_index
);
5463 category_val
= Fsymbol_value (XCAR (tmp
));
5464 if (!NILP (category_val
)
5465 && NATNUMP (category_index
)
5466 && (coding_mask
& (1 << XFASTINT (category_index
))))
5468 val
= Fcons (category_val
, val
);
5474 val
= Fnreverse (val
);
5476 /* Then, replace the elements with subsidiary coding systems. */
5477 for (tmp
= val
; CONSP (tmp
); tmp
= XCDR (tmp
))
5479 if (eol_type
!= CODING_EOL_UNDECIDED
5480 && eol_type
!= CODING_EOL_INCONSISTENT
)
5483 eol
= Fget (XCAR (tmp
), Qeol_type
);
5485 XCAR (tmp
) = XVECTOR (eol
)->contents
[eol_type
];
5488 return (highest
? XCAR (val
) : val
);
5491 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
5493 "Detect coding system of the text in the region between START and END.\n\
5494 Return a list of possible coding systems ordered by priority.\n\
5496 If only ASCII characters are found, it returns a list of single element\n\
5497 `undecided' or its subsidiary coding system according to a detected\n\
5498 end-of-line format.\n\
5500 If optional argument HIGHEST is non-nil, return the coding system of\n\
5502 (start
, end
, highest
)
5503 Lisp_Object start
, end
, highest
;
5506 int from_byte
, to_byte
;
5508 CHECK_NUMBER_COERCE_MARKER (start
, 0);
5509 CHECK_NUMBER_COERCE_MARKER (end
, 1);
5511 validate_region (&start
, &end
);
5512 from
= XINT (start
), to
= XINT (end
);
5513 from_byte
= CHAR_TO_BYTE (from
);
5514 to_byte
= CHAR_TO_BYTE (to
);
5516 if (from
< GPT
&& to
>= GPT
)
5517 move_gap_both (to
, to_byte
);
5519 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
5520 to_byte
- from_byte
,
5524 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
5526 "Detect coding system of the text in STRING.\n\
5527 Return a list of possible coding systems ordered by priority.\n\
5529 If only ASCII characters are found, it returns a list of single element\n\
5530 `undecided' or its subsidiary coding system according to a detected\n\
5531 end-of-line format.\n\
5533 If optional argument HIGHEST is non-nil, return the coding system of\n\
5536 Lisp_Object string
, highest
;
5538 CHECK_STRING (string
, 0);
5540 return detect_coding_system (XSTRING (string
)->data
,
5541 STRING_BYTES (XSTRING (string
)),
5546 code_convert_region1 (start
, end
, coding_system
, encodep
)
5547 Lisp_Object start
, end
, coding_system
;
5550 struct coding_system coding
;
5553 CHECK_NUMBER_COERCE_MARKER (start
, 0);
5554 CHECK_NUMBER_COERCE_MARKER (end
, 1);
5555 CHECK_SYMBOL (coding_system
, 2);
5557 validate_region (&start
, &end
);
5558 from
= XFASTINT (start
);
5559 to
= XFASTINT (end
);
5561 if (NILP (coding_system
))
5562 return make_number (to
- from
);
5564 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5565 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5567 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5568 coding
.src_multibyte
= coding
.dst_multibyte
5569 = !NILP (current_buffer
->enable_multibyte_characters
);
5570 code_convert_region (from
, CHAR_TO_BYTE (from
), to
, CHAR_TO_BYTE (to
),
5571 &coding
, encodep
, 1);
5572 Vlast_coding_system_used
= coding
.symbol
;
5573 return make_number (coding
.produced_char
);
5576 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
5577 3, 3, "r\nzCoding system: ",
5578 "Decode the current region by specified coding system.\n\
5579 When called from a program, takes three arguments:\n\
5580 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5581 This function sets `last-coding-system-used' to the precise coding system\n\
5582 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5583 not fully specified.)\n\
5584 It returns the length of the decoded text.")
5585 (start
, end
, coding_system
)
5586 Lisp_Object start
, end
, coding_system
;
5588 return code_convert_region1 (start
, end
, coding_system
, 0);
5591 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
5592 3, 3, "r\nzCoding system: ",
5593 "Encode the current region by specified coding system.\n\
5594 When called from a program, takes three arguments:\n\
5595 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5596 This function sets `last-coding-system-used' to the precise coding system\n\
5597 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5598 not fully specified.)\n\
5599 It returns the length of the encoded text.")
5600 (start
, end
, coding_system
)
5601 Lisp_Object start
, end
, coding_system
;
5603 return code_convert_region1 (start
, end
, coding_system
, 1);
5607 code_convert_string1 (string
, coding_system
, nocopy
, encodep
)
5608 Lisp_Object string
, coding_system
, nocopy
;
5611 struct coding_system coding
;
5613 CHECK_STRING (string
, 0);
5614 CHECK_SYMBOL (coding_system
, 1);
5616 if (NILP (coding_system
))
5617 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
5619 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5620 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5622 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5624 ? encode_coding_string (string
, &coding
, !NILP (nocopy
))
5625 : decode_coding_string (string
, &coding
, !NILP (nocopy
)));
5626 Vlast_coding_system_used
= coding
.symbol
;
5631 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
5633 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5634 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5635 if the decoding operation is trivial.\n\
5636 This function sets `last-coding-system-used' to the precise coding system\n\
5637 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5638 not fully specified.)")
5639 (string
, coding_system
, nocopy
)
5640 Lisp_Object string
, coding_system
, nocopy
;
5642 return code_convert_string1 (string
, coding_system
, nocopy
, 0);
5645 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
5647 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5648 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5649 if the encoding operation is trivial.\n\
5650 This function sets `last-coding-system-used' to the precise coding system\n\
5651 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5652 not fully specified.)")
5653 (string
, coding_system
, nocopy
)
5654 Lisp_Object string
, coding_system
, nocopy
;
5656 return code_convert_string1 (string
, coding_system
, nocopy
, 1);
5659 /* Encode or decode STRING according to CODING_SYSTEM.
5660 Do not set Vlast_coding_system_used.
5662 This function is called only from macros DECODE_FILE and
5663 ENCODE_FILE, thus we ignore character composition. */
5666 code_convert_string_norecord (string
, coding_system
, encodep
)
5667 Lisp_Object string
, coding_system
;
5670 struct coding_system coding
;
5672 CHECK_STRING (string
, 0);
5673 CHECK_SYMBOL (coding_system
, 1);
5675 if (NILP (coding_system
))
5678 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5679 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5681 coding
.composing
= COMPOSITION_DISABLED
;
5682 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5684 ? encode_coding_string (string
, &coding
, 1)
5685 : decode_coding_string (string
, &coding
, 1));
5688 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
5689 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5690 Return the corresponding character.")
5694 unsigned char c1
, c2
, s1
, s2
;
5697 CHECK_NUMBER (code
, 0);
5698 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
5702 XSETFASTINT (val
, s2
);
5703 else if (s2
>= 0xA0 || s2
<= 0xDF)
5704 XSETFASTINT (val
, MAKE_CHAR (charset_katakana_jisx0201
, s2
, 0));
5706 error ("Invalid Shift JIS code: %x", XFASTINT (code
));
5710 if ((s1
< 0x80 || s1
> 0x9F && s1
< 0xE0 || s1
> 0xEF)
5711 || (s2
< 0x40 || s2
== 0x7F || s2
> 0xFC))
5712 error ("Invalid Shift JIS code: %x", XFASTINT (code
));
5713 DECODE_SJIS (s1
, s2
, c1
, c2
);
5714 XSETFASTINT (val
, MAKE_CHAR (charset_jisx0208
, c1
, c2
));
5719 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
5720 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5721 Return the corresponding code in SJIS.")
5725 int charset
, c1
, c2
, s1
, s2
;
5728 CHECK_NUMBER (ch
, 0);
5729 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5730 if (charset
== CHARSET_ASCII
)
5734 else if (charset
== charset_jisx0208
5735 && c1
> 0x20 && c1
< 0x7F && c2
> 0x20 && c2
< 0x7F)
5737 ENCODE_SJIS (c1
, c2
, s1
, s2
);
5738 XSETFASTINT (val
, (s1
<< 8) | s2
);
5740 else if (charset
== charset_katakana_jisx0201
5741 && c1
> 0x20 && c2
< 0xE0)
5743 XSETFASTINT (val
, c1
| 0x80);
5746 error ("Can't encode to shift_jis: %d", XFASTINT (ch
));
5750 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
5751 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5752 Return the corresponding character.")
5757 unsigned char b1
, b2
, c1
, c2
;
5760 CHECK_NUMBER (code
, 0);
5761 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
5765 error ("Invalid BIG5 code: %x", XFASTINT (code
));
5770 if ((b1
< 0xA1 || b1
> 0xFE)
5771 || (b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE))
5772 error ("Invalid BIG5 code: %x", XFASTINT (code
));
5773 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
5774 XSETFASTINT (val
, MAKE_CHAR (charset
, c1
, c2
));
5779 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
5780 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5781 Return the corresponding character code in Big5.")
5785 int charset
, c1
, c2
, b1
, b2
;
5788 CHECK_NUMBER (ch
, 0);
5789 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5790 if (charset
== CHARSET_ASCII
)
5794 else if ((charset
== charset_big5_1
5795 && (XFASTINT (ch
) >= 0x250a1 && XFASTINT (ch
) <= 0x271ec))
5796 || (charset
== charset_big5_2
5797 && XFASTINT (ch
) >= 0x290a1 && XFASTINT (ch
) <= 0x2bdb2))
5799 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
5800 XSETFASTINT (val
, (b1
<< 8) | b2
);
5803 error ("Can't encode to Big5: %d", XFASTINT (ch
));
5807 DEFUN ("set-terminal-coding-system-internal",
5808 Fset_terminal_coding_system_internal
,
5809 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
5811 Lisp_Object coding_system
;
5813 CHECK_SYMBOL (coding_system
, 0);
5814 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
5815 /* We had better not send unsafe characters to terminal. */
5816 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
5817 /* Characer composition should be disabled. */
5818 terminal_coding
.composing
= COMPOSITION_DISABLED
;
5819 terminal_coding
.src_multibyte
= 1;
5820 terminal_coding
.dst_multibyte
= 0;
5824 DEFUN ("set-safe-terminal-coding-system-internal",
5825 Fset_safe_terminal_coding_system_internal
,
5826 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
5828 Lisp_Object coding_system
;
5830 CHECK_SYMBOL (coding_system
, 0);
5831 setup_coding_system (Fcheck_coding_system (coding_system
),
5832 &safe_terminal_coding
);
5833 /* Characer composition should be disabled. */
5834 safe_terminal_coding
.composing
= COMPOSITION_DISABLED
;
5835 safe_terminal_coding
.src_multibyte
= 1;
5836 safe_terminal_coding
.dst_multibyte
= 0;
5840 DEFUN ("terminal-coding-system",
5841 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
5842 "Return coding system specified for terminal output.")
5845 return terminal_coding
.symbol
;
5848 DEFUN ("set-keyboard-coding-system-internal",
5849 Fset_keyboard_coding_system_internal
,
5850 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
5852 Lisp_Object coding_system
;
5854 CHECK_SYMBOL (coding_system
, 0);
5855 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
5856 /* Characer composition should be disabled. */
5857 keyboard_coding
.composing
= COMPOSITION_DISABLED
;
5861 DEFUN ("keyboard-coding-system",
5862 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
5863 "Return coding system specified for decoding keyboard input.")
5866 return keyboard_coding
.symbol
;
5870 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
5871 Sfind_operation_coding_system
, 1, MANY
, 0,
5872 "Choose a coding system for an operation based on the target name.\n\
5873 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5874 DECODING-SYSTEM is the coding system to use for decoding\n\
5875 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5876 for encoding (in case OPERATION does encoding).\n\
5878 The first argument OPERATION specifies an I/O primitive:\n\
5879 For file I/O, `insert-file-contents' or `write-region'.\n\
5880 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5881 For network I/O, `open-network-stream'.\n\
5883 The remaining arguments should be the same arguments that were passed\n\
5884 to the primitive. Depending on which primitive, one of those arguments\n\
5885 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5886 whichever argument specifies the file name is TARGET.\n\
5888 TARGET has a meaning which depends on OPERATION:\n\
5889 For file I/O, TARGET is a file name.\n\
5890 For process I/O, TARGET is a process name.\n\
5891 For network I/O, TARGET is a service name or a port number\n\
5893 This function looks up what specified for TARGET in,\n\
5894 `file-coding-system-alist', `process-coding-system-alist',\n\
5895 or `network-coding-system-alist' depending on OPERATION.\n\
5896 They may specify a coding system, a cons of coding systems,\n\
5897 or a function symbol to call.\n\
5898 In the last case, we call the function with one argument,\n\
5899 which is a list of all the arguments given to this function.")
5904 Lisp_Object operation
, target_idx
, target
, val
;
5905 register Lisp_Object chain
;
5908 error ("Too few arguments");
5909 operation
= args
[0];
5910 if (!SYMBOLP (operation
)
5911 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
5912 error ("Invalid first arguement");
5913 if (nargs
< 1 + XINT (target_idx
))
5914 error ("Too few arguments for operation: %s",
5915 XSYMBOL (operation
)->name
->data
);
5916 target
= args
[XINT (target_idx
) + 1];
5917 if (!(STRINGP (target
)
5918 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
5919 error ("Invalid %dth argument", XINT (target_idx
) + 1);
5921 chain
= ((EQ (operation
, Qinsert_file_contents
)
5922 || EQ (operation
, Qwrite_region
))
5923 ? Vfile_coding_system_alist
5924 : (EQ (operation
, Qopen_network_stream
)
5925 ? Vnetwork_coding_system_alist
5926 : Vprocess_coding_system_alist
));
5930 for (; CONSP (chain
); chain
= XCDR (chain
))
5936 && ((STRINGP (target
)
5937 && STRINGP (XCAR (elt
))
5938 && fast_string_match (XCAR (elt
), target
) >= 0)
5939 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
5942 /* Here, if VAL is both a valid coding system and a valid
5943 function symbol, we return VAL as a coding system. */
5946 if (! SYMBOLP (val
))
5948 if (! NILP (Fcoding_system_p (val
)))
5949 return Fcons (val
, val
);
5950 if (! NILP (Ffboundp (val
)))
5952 val
= call1 (val
, Flist (nargs
, args
));
5955 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
5956 return Fcons (val
, val
);
5964 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal
,
5965 Supdate_coding_systems_internal
, 0, 0, 0,
5966 "Update internal database for ISO2022 and CCL based coding systems.\n\
5967 When values of any coding categories are changed, you must\n\
5968 call this function")
5973 for (i
= CODING_CATEGORY_IDX_EMACS_MULE
; i
< CODING_CATEGORY_IDX_MAX
; i
++)
5977 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[i
])->value
;
5980 if (! coding_system_table
[i
])
5981 coding_system_table
[i
] = ((struct coding_system
*)
5982 xmalloc (sizeof (struct coding_system
)));
5983 setup_coding_system (val
, coding_system_table
[i
]);
5985 else if (coding_system_table
[i
])
5987 xfree (coding_system_table
[i
]);
5988 coding_system_table
[i
] = NULL
;
5995 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal
,
5996 Sset_coding_priority_internal
, 0, 0, 0,
5997 "Update internal database for the current value of `coding-category-list'.\n\
5998 This function is internal use only.")
6004 val
= Vcoding_category_list
;
6006 while (CONSP (val
) && i
< CODING_CATEGORY_IDX_MAX
)
6008 if (! SYMBOLP (XCAR (val
)))
6010 idx
= XFASTINT (Fget (XCAR (val
), Qcoding_category_index
));
6011 if (idx
>= CODING_CATEGORY_IDX_MAX
)
6013 coding_priorities
[i
++] = (1 << idx
);
6016 /* If coding-category-list is valid and contains all coding
6017 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
6018 the following code saves Emacs from crashing. */
6019 while (i
< CODING_CATEGORY_IDX_MAX
)
6020 coding_priorities
[i
++] = CODING_CATEGORY_MASK_RAW_TEXT
;
6028 /*** 9. Post-amble ***/
6033 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
6041 /* Emacs' internal format specific initialize routine. */
6042 for (i
= 0; i
<= 0x20; i
++)
6043 emacs_code_class
[i
] = EMACS_control_code
;
6044 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
6045 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
6046 for (i
= 0x21 ; i
< 0x7F; i
++)
6047 emacs_code_class
[i
] = EMACS_ascii_code
;
6048 emacs_code_class
[0x7F] = EMACS_control_code
;
6049 for (i
= 0x80; i
< 0xFF; i
++)
6050 emacs_code_class
[i
] = EMACS_invalid_code
;
6051 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
6052 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
6053 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
6054 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
6056 /* ISO2022 specific initialize routine. */
6057 for (i
= 0; i
< 0x20; i
++)
6058 iso_code_class
[i
] = ISO_control_0
;
6059 for (i
= 0x21; i
< 0x7F; i
++)
6060 iso_code_class
[i
] = ISO_graphic_plane_0
;
6061 for (i
= 0x80; i
< 0xA0; i
++)
6062 iso_code_class
[i
] = ISO_control_1
;
6063 for (i
= 0xA1; i
< 0xFF; i
++)
6064 iso_code_class
[i
] = ISO_graphic_plane_1
;
6065 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
6066 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
6067 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
6068 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
6069 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
6070 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
6071 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
6072 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
6073 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
6074 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
6076 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
6078 setup_coding_system (Qnil
, &keyboard_coding
);
6079 setup_coding_system (Qnil
, &terminal_coding
);
6080 setup_coding_system (Qnil
, &safe_terminal_coding
);
6081 setup_coding_system (Qnil
, &default_buffer_file_coding
);
6083 bzero (coding_system_table
, sizeof coding_system_table
);
6085 bzero (ascii_skip_code
, sizeof ascii_skip_code
);
6086 for (i
= 0; i
< 128; i
++)
6087 ascii_skip_code
[i
] = 1;
6089 #if defined (MSDOS) || defined (WINDOWSNT)
6090 system_eol_type
= CODING_EOL_CRLF
;
6092 system_eol_type
= CODING_EOL_LF
;
6095 inhibit_pre_post_conversion
= 0;
6103 Qtarget_idx
= intern ("target-idx");
6104 staticpro (&Qtarget_idx
);
6106 Qcoding_system_history
= intern ("coding-system-history");
6107 staticpro (&Qcoding_system_history
);
6108 Fset (Qcoding_system_history
, Qnil
);
6110 /* Target FILENAME is the first argument. */
6111 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
6112 /* Target FILENAME is the third argument. */
6113 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
6115 Qcall_process
= intern ("call-process");
6116 staticpro (&Qcall_process
);
6117 /* Target PROGRAM is the first argument. */
6118 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
6120 Qcall_process_region
= intern ("call-process-region");
6121 staticpro (&Qcall_process_region
);
6122 /* Target PROGRAM is the third argument. */
6123 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
6125 Qstart_process
= intern ("start-process");
6126 staticpro (&Qstart_process
);
6127 /* Target PROGRAM is the third argument. */
6128 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
6130 Qopen_network_stream
= intern ("open-network-stream");
6131 staticpro (&Qopen_network_stream
);
6132 /* Target SERVICE is the fourth argument. */
6133 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
6135 Qcoding_system
= intern ("coding-system");
6136 staticpro (&Qcoding_system
);
6138 Qeol_type
= intern ("eol-type");
6139 staticpro (&Qeol_type
);
6141 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
6142 staticpro (&Qbuffer_file_coding_system
);
6144 Qpost_read_conversion
= intern ("post-read-conversion");
6145 staticpro (&Qpost_read_conversion
);
6147 Qpre_write_conversion
= intern ("pre-write-conversion");
6148 staticpro (&Qpre_write_conversion
);
6150 Qno_conversion
= intern ("no-conversion");
6151 staticpro (&Qno_conversion
);
6153 Qundecided
= intern ("undecided");
6154 staticpro (&Qundecided
);
6156 Qcoding_system_p
= intern ("coding-system-p");
6157 staticpro (&Qcoding_system_p
);
6159 Qcoding_system_error
= intern ("coding-system-error");
6160 staticpro (&Qcoding_system_error
);
6162 Fput (Qcoding_system_error
, Qerror_conditions
,
6163 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
6164 Fput (Qcoding_system_error
, Qerror_message
,
6165 build_string ("Invalid coding system"));
6167 Qcoding_category
= intern ("coding-category");
6168 staticpro (&Qcoding_category
);
6169 Qcoding_category_index
= intern ("coding-category-index");
6170 staticpro (&Qcoding_category_index
);
6172 Vcoding_category_table
6173 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX
), Qnil
);
6174 staticpro (&Vcoding_category_table
);
6177 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
6179 XVECTOR (Vcoding_category_table
)->contents
[i
]
6180 = intern (coding_category_name
[i
]);
6181 Fput (XVECTOR (Vcoding_category_table
)->contents
[i
],
6182 Qcoding_category_index
, make_number (i
));
6186 Qtranslation_table
= intern ("translation-table");
6187 staticpro (&Qtranslation_table
);
6188 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
6190 Qtranslation_table_id
= intern ("translation-table-id");
6191 staticpro (&Qtranslation_table_id
);
6193 Qtranslation_table_for_decode
= intern ("translation-table-for-decode");
6194 staticpro (&Qtranslation_table_for_decode
);
6196 Qtranslation_table_for_encode
= intern ("translation-table-for-encode");
6197 staticpro (&Qtranslation_table_for_encode
);
6199 Qsafe_charsets
= intern ("safe-charsets");
6200 staticpro (&Qsafe_charsets
);
6202 Qvalid_codes
= intern ("valid-codes");
6203 staticpro (&Qvalid_codes
);
6205 Qemacs_mule
= intern ("emacs-mule");
6206 staticpro (&Qemacs_mule
);
6208 Qraw_text
= intern ("raw-text");
6209 staticpro (&Qraw_text
);
6211 defsubr (&Scoding_system_p
);
6212 defsubr (&Sread_coding_system
);
6213 defsubr (&Sread_non_nil_coding_system
);
6214 defsubr (&Scheck_coding_system
);
6215 defsubr (&Sdetect_coding_region
);
6216 defsubr (&Sdetect_coding_string
);
6217 defsubr (&Sdecode_coding_region
);
6218 defsubr (&Sencode_coding_region
);
6219 defsubr (&Sdecode_coding_string
);
6220 defsubr (&Sencode_coding_string
);
6221 defsubr (&Sdecode_sjis_char
);
6222 defsubr (&Sencode_sjis_char
);
6223 defsubr (&Sdecode_big5_char
);
6224 defsubr (&Sencode_big5_char
);
6225 defsubr (&Sset_terminal_coding_system_internal
);
6226 defsubr (&Sset_safe_terminal_coding_system_internal
);
6227 defsubr (&Sterminal_coding_system
);
6228 defsubr (&Sset_keyboard_coding_system_internal
);
6229 defsubr (&Skeyboard_coding_system
);
6230 defsubr (&Sfind_operation_coding_system
);
6231 defsubr (&Supdate_coding_systems_internal
);
6232 defsubr (&Sset_coding_priority_internal
);
6234 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
6235 "List of coding systems.\n\
6237 Do not alter the value of this variable manually. This variable should be\n\
6238 updated by the functions `make-coding-system' and\n\
6239 `define-coding-system-alias'.");
6240 Vcoding_system_list
= Qnil
;
6242 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
6243 "Alist of coding system names.\n\
6244 Each element is one element list of coding system name.\n\
6245 This variable is given to `completing-read' as TABLE argument.\n\
6247 Do not alter the value of this variable manually. This variable should be\n\
6248 updated by the functions `make-coding-system' and\n\
6249 `define-coding-system-alias'.");
6250 Vcoding_system_alist
= Qnil
;
6252 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
6253 "List of coding-categories (symbols) ordered by priority.");
6257 Vcoding_category_list
= Qnil
;
6258 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
6259 Vcoding_category_list
6260 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
6261 Vcoding_category_list
);
6264 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
6265 "Specify the coding system for read operations.\n\
6266 It is useful to bind this variable with `let', but do not set it globally.\n\
6267 If the value is a coding system, it is used for decoding on read operation.\n\
6268 If not, an appropriate element is used from one of the coding system alists:\n\
6269 There are three such tables, `file-coding-system-alist',\n\
6270 `process-coding-system-alist', and `network-coding-system-alist'.");
6271 Vcoding_system_for_read
= Qnil
;
6273 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
6274 "Specify the coding system for write operations.\n\
6275 Programs bind this variable with `let', but you should not set it globally.\n\
6276 If the value is a coding system, it is used for encoding of output,\n\
6277 when writing it to a file and when sending it to a file or subprocess.\n\
6279 If this does not specify a coding system, an appropriate element\n\
6280 is used from one of the coding system alists:\n\
6281 There are three such tables, `file-coding-system-alist',\n\
6282 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6283 For output to files, if the above procedure does not specify a coding system,\n\
6284 the value of `buffer-file-coding-system' is used.");
6285 Vcoding_system_for_write
= Qnil
;
6287 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
6288 "Coding system used in the latest file or process I/O.");
6289 Vlast_coding_system_used
= Qnil
;
6291 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
6292 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6293 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6295 inhibit_eol_conversion
= 0;
6297 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
6298 "Non-nil means process buffer inherits coding system of process output.\n\
6299 Bind it to t if the process output is to be treated as if it were a file\n\
6300 read from some filesystem.");
6301 inherit_process_coding_system
= 0;
6303 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
6304 "Alist to decide a coding system to use for a file I/O operation.\n\
6305 The format is ((PATTERN . VAL) ...),\n\
6306 where PATTERN is a regular expression matching a file name,\n\
6307 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6308 If VAL is a coding system, it is used for both decoding and encoding\n\
6309 the file contents.\n\
6310 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6311 and the cdr part is used for encoding.\n\
6312 If VAL is a function symbol, the function must return a coding system\n\
6313 or a cons of coding systems which are used as above.\n\
6315 See also the function `find-operation-coding-system'\n\
6316 and the variable `auto-coding-alist'.");
6317 Vfile_coding_system_alist
= Qnil
;
6319 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
6320 "Alist to decide a coding system to use for a process I/O operation.\n\
6321 The format is ((PATTERN . VAL) ...),\n\
6322 where PATTERN is a regular expression matching a program name,\n\
6323 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6324 If VAL is a coding system, it is used for both decoding what received\n\
6325 from the program and encoding what sent to the program.\n\
6326 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6327 and the cdr part is used for encoding.\n\
6328 If VAL is a function symbol, the function must return a coding system\n\
6329 or a cons of coding systems which are used as above.\n\
6331 See also the function `find-operation-coding-system'.");
6332 Vprocess_coding_system_alist
= Qnil
;
6334 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
6335 "Alist to decide a coding system to use for a network I/O operation.\n\
6336 The format is ((PATTERN . VAL) ...),\n\
6337 where PATTERN is a regular expression matching a network service name\n\
6338 or is a port number to connect to,\n\
6339 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6340 If VAL is a coding system, it is used for both decoding what received\n\
6341 from the network stream and encoding what sent to the network stream.\n\
6342 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6343 and the cdr part is used for encoding.\n\
6344 If VAL is a function symbol, the function must return a coding system\n\
6345 or a cons of coding systems which are used as above.\n\
6347 See also the function `find-operation-coding-system'.");
6348 Vnetwork_coding_system_alist
= Qnil
;
6350 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
6351 "Coding system to use with system messages.");
6352 Vlocale_coding_system
= Qnil
;
6354 /* The eol mnemonics are reset in startup.el system-dependently. */
6355 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
6356 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6357 eol_mnemonic_unix
= build_string (":");
6359 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
6360 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6361 eol_mnemonic_dos
= build_string ("\\");
6363 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
6364 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6365 eol_mnemonic_mac
= build_string ("/");
6367 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
6368 "*String displayed in mode line when end-of-line format is not yet determined.");
6369 eol_mnemonic_undecided
= build_string (":");
6371 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
6372 "*Non-nil enables character translation while encoding and decoding.");
6373 Venable_character_translation
= Qt
;
6375 DEFVAR_LISP ("standard-translation-table-for-decode",
6376 &Vstandard_translation_table_for_decode
,
6377 "Table for translating characters while decoding.");
6378 Vstandard_translation_table_for_decode
= Qnil
;
6380 DEFVAR_LISP ("standard-translation-table-for-encode",
6381 &Vstandard_translation_table_for_encode
,
6382 "Table for translationg characters while encoding.");
6383 Vstandard_translation_table_for_encode
= Qnil
;
6385 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
6386 "Alist of charsets vs revision numbers.\n\
6387 While encoding, if a charset (car part of an element) is found,\n\
6388 designate it with the escape sequence identifing revision (cdr part of the element).");
6389 Vcharset_revision_alist
= Qnil
;
6391 DEFVAR_LISP ("default-process-coding-system",
6392 &Vdefault_process_coding_system
,
6393 "Cons of coding systems used for process I/O by default.\n\
6394 The car part is used for decoding a process output,\n\
6395 the cdr part is used for encoding a text to be sent to a process.");
6396 Vdefault_process_coding_system
= Qnil
;
6398 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
6399 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6400 This is a vector of length 256.\n\
6401 If Nth element is non-nil, the existence of code N in a file\n\
6402 \(or output of subprocess) doesn't prevent it to be detected as\n\
6403 a coding system of ISO 2022 variant which has a flag\n\
6404 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6405 or reading output of a subprocess.\n\
6406 Only 128th through 159th elements has a meaning.");
6407 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
6409 DEFVAR_LISP ("select-safe-coding-system-function",
6410 &Vselect_safe_coding_system_function
,
6411 "Function to call to select safe coding system for encoding a text.\n\
6413 If set, this function is called to force a user to select a proper\n\
6414 coding system which can encode the text in the case that a default\n\
6415 coding system used in each operation can't encode the text.\n\
6417 The default value is `select-safe-coding-system' (which see).");
6418 Vselect_safe_coding_system_function
= Qnil
;
6420 DEFVAR_BOOL ("inhibit-iso-eacape-detection",
6421 &inhibit_iso_escape_detection
,
6422 "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\
6424 By default, on reading a file, Emacs tries to detect how the text is\n\
6425 encoded. This code detection is sensitive to escape sequences. If\n\
6426 the sequence is valid as ISO2022, the code is detemined as one of\n\
6427 ISO2022 encoding, and the file is decoded by the corresponding coding\n\
6428 system (e.g. `iso-2022-7bit').\n\
6430 However, there may be a case that you want to read escape sequences in\n\
6431 a file as is. In such a case, you can set this variable to non-nil.\n\
6432 Then, as the code detection ignores any escape sequences, no file is\n\
6433 detected as some of ISO2022 encoding. The result is that all escape\n\
6434 sequences become visible in a buffer.\n\
6436 The default value is nil, and it is strongly recommended not to change\n\
6437 it. That is because many Emacs Lisp source files that contain\n\
6438 non-ASCII characters are encoded by the coding system `iso-2022-7bit'\n\
6439 in Emacs's distribution, and they won't be decoded correctly on\n\
6440 reading if you suppress escapse sequence detection.\n\
6442 The other way to read escape sequences in a file without decoding is\n\
6443 to explicitely specify some coding system that doesn't use ISO2022's\n\
6444 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].");
6445 inhibit_iso_escape_detection
= 0;
6449 emacs_strerror (error_number
)
6454 synchronize_system_messages_locale ();
6455 str
= strerror (error_number
);
6457 if (! NILP (Vlocale_coding_system
))
6459 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
6460 Vlocale_coding_system
,
6462 str
= (char *) XSTRING (dec
)->data
;