1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
49 1. ISO2022
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PCs in Japan. Details are described in
60 section 4.
62 3. BIG5
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
70 4. Raw text
72 A coding system for a text containing random 8-bit code. Emacs does
73 no code conversion on such a text except for end-of-line format.
75 5. Other
77 If a user wants to read/write a text encoded in a coding system not
78 listed above, he can supply a decoder and an encoder for it in CCL
79 (Code Conversion Language) programs. Emacs executes the CCL program
80 while reading/writing.
82 Emacs represents a coding system by a Lisp symbol that has a property
83 `coding-system'. But, before actually using the coding system, the
84 information about it is set in a structure of type `struct
85 coding_system' for rapid processing. See section 6 for more details.
89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
91 How end-of-line of a text is encoded depends on a system. For
92 instance, Unix's format is just one byte of `line-feed' code,
93 whereas DOS's format is two-byte sequence of `carriage-return' and
94 `line-feed' codes. MacOS's format is usually one byte of
95 `carriage-return'.
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
113 detect_coding_emacs_mule (src
, src_end
)
114 unsigned char *src
, *src_end
;
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which
125 should not exceed DST_BYTES. These functions set the information of
126 original and decoded texts in the members produced, produced_char,
127 consumed, and consumed_char of the structure *CODING.
129 The return value is an integer (CODING_FINISH_XXX) indicating how
130 the decoding finished.
132 DST_BYTES zero means that source area and destination area are
133 overlapped, which means that we can produce decoded text only until
134 it reaches the head of the not-yet-decoded source text.
136 Below is a template of these functions. */
138 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
139 struct coding_system
*coding
;
140 unsigned char *source
, *destination
;
141 int src_bytes
, dst_bytes
;
147 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149 These functions encode SRC_BYTES length text at SOURCE of Emacs'
150 internal format (emacs-mule) to CODING. The resulting text goes to
151 a place pointed to by DESTINATION, the length of which should not
152 exceed DST_BYTES. These functions set the information of
153 original and encoded texts in the members produced, produced_char,
154 consumed, and consumed_char of the structure *CODING.
156 The return value is an integer (CODING_FINISH_XXX) indicating how
157 the encoding finished.
159 DST_BYTES zero means that source area and destination area are
160 overlapped, which means that we can produce encoded text only until
161 it reaches the head of the not-yet-encoded source text.
163 Below is a template of these functions. */
165 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
166 struct coding_system
*coding
;
167 unsigned char *source
, *destination
;
168 int src_bytes
, dst_bytes
;
174 /*** COMMONLY USED MACROS ***/
176 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
177 THREE_MORE_BYTES safely get one, two, and three bytes from the
178 source text respectively. If there are not enough bytes in the
179 source, they jump to `label_end_of_loop'. The caller should set
180 variables `src' and `src_end' to appropriate areas in advance. */
182 #define ONE_MORE_BYTE(c1) \
187 goto label_end_of_loop; \
190 #define TWO_MORE_BYTES(c1, c2) \
192 if (src + 1 < src_end) \
193 c1 = *src++, c2 = *src++; \
195 goto label_end_of_loop; \
198 #define THREE_MORE_BYTES(c1, c2, c3) \
200 if (src + 2 < src_end) \
201 c1 = *src++, c2 = *src++, c3 = *src++; \
203 goto label_end_of_loop; \
206 /* The following three macros DECODE_CHARACTER_ASCII,
207 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
208 the multi-byte form of a character of each class at the place
209 pointed by `dst'. The caller should set the variable `dst' to
210 point to an appropriate area and the variable `coding' to point to
211 the coding-system of the currently decoding text in advance. */
213 /* Decode one ASCII character C. */
215 #define DECODE_CHARACTER_ASCII(c) \
217 if (COMPOSING_P (coding->composing)) \
218 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
222 coding->produced_char++; \
226 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
227 position-code is C. */
229 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
231 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
232 if (COMPOSING_P (coding->composing)) \
233 *dst++ = leading_code + 0x20; \
236 *dst++ = leading_code; \
237 coding->produced_char++; \
239 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
240 *dst++ = leading_code; \
241 *dst++ = (c) | 0x80; \
244 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
245 position-codes are C1 and C2. */
247 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
249 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
250 *dst++ = (c2) | 0x80; \
254 /*** 1. Preamble ***/
268 #else /* not emacs */
272 #endif /* not emacs */
274 Lisp_Object Qcoding_system
, Qeol_type
;
275 Lisp_Object Qbuffer_file_coding_system
;
276 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
277 Lisp_Object Qno_conversion
, Qundecided
;
278 Lisp_Object Qcoding_system_history
;
279 Lisp_Object Qsafe_charsets
;
281 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
282 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
283 Lisp_Object Qstart_process
, Qopen_network_stream
;
284 Lisp_Object Qtarget_idx
;
286 Lisp_Object Vselect_safe_coding_system_function
;
288 /* Mnemonic character of each format of end-of-line. */
289 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
290 /* Mnemonic character to indicate format of end-of-line is not yet
291 decided. */
292 int eol_mnemonic_undecided
;
294 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
295 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
300 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
302 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
304 /* Coding system emacs-mule and raw-text are for converting only
305 end-of-line format. */
306 Lisp_Object Qemacs_mule
, Qraw_text
;
308 /* Coding-systems are handed between Emacs Lisp programs and C internal
309 routines by the following three variables. */
310 /* Coding-system for reading files and receiving data from process. */
311 Lisp_Object Vcoding_system_for_read
;
312 /* Coding-system for writing files and sending data to process. */
313 Lisp_Object Vcoding_system_for_write
;
314 /* Coding-system actually used in the latest I/O. */
315 Lisp_Object Vlast_coding_system_used
;
317 /* A vector of length 256 which contains information about special
318 Latin codes (especially for dealing with Microsoft code). */
319 Lisp_Object Vlatin_extra_code_table
;
321 /* Flag to inhibit code conversion of end-of-line format. */
322 int inhibit_eol_conversion
;
324 /* Coding system to be used to encode text for terminal display. */
325 struct coding_system terminal_coding
;
327 /* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329 struct coding_system safe_terminal_coding
;
331 /* Coding system of what is sent from terminal keyboard. */
332 struct coding_system keyboard_coding
;
334 Lisp_Object Vfile_coding_system_alist
;
335 Lisp_Object Vprocess_coding_system_alist
;
336 Lisp_Object Vnetwork_coding_system_alist
;
340 Lisp_Object Qcoding_category
, Qcoding_category_index
;
342 /* List of symbols `coding-category-xxx' ordered by priority. */
343 Lisp_Object Vcoding_category_list
;
345 /* Table of coding categories (Lisp symbols). */
346 Lisp_Object Vcoding_category_table
;
348 /* Table of names of symbol for each coding-category. */
349 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
350 "coding-category-emacs-mule",
351 "coding-category-sjis",
352 "coding-category-iso-7",
353 "coding-category-iso-7-tight",
354 "coding-category-iso-8-1",
355 "coding-category-iso-8-2",
356 "coding-category-iso-7-else",
357 "coding-category-iso-8-else",
358 "coding-category-big5",
359 "coding-category-raw-text",
360 "coding-category-binary"
363 /* Table of pointers to coding systems corresponding to each coding
364 category. */
365 struct coding_system
*coding_system_table
[CODING_CATEGORY_IDX_MAX
];
367 /* Flag to tell if we look up unification table on character code
368 conversion. */
369 Lisp_Object Venable_character_unification
;
370 /* Standard unification table to look up on decoding (reading). */
371 Lisp_Object Vstandard_character_unification_table_for_decode
;
372 /* Standard unification table to look up on encoding (writing). */
373 Lisp_Object Vstandard_character_unification_table_for_encode
;
375 Lisp_Object Qcharacter_unification_table
;
376 Lisp_Object Qcharacter_unification_table_for_decode
;
377 Lisp_Object Qcharacter_unification_table_for_encode
;
379 /* Alist of charsets vs revision number. */
380 Lisp_Object Vcharset_revision_alist
;
382 /* Default coding systems used for process I/O. */
383 Lisp_Object Vdefault_process_coding_system
;
386 /*** 2. Emacs internal format (emacs-mule) handlers ***/
388 /* Emacs' internal format for encoding multiple character sets is a
389 kind of multi-byte encoding, i.e. characters are encoded by
390 variable-length sequences of one-byte codes. ASCII characters
391 and control characters (e.g. `tab', `newline') are represented by
392 one-byte sequences which are their ASCII codes, in the range 0x00
393 through 0x7F. The other characters are represented by a sequence
394 of `base leading-code', optional `extended leading-code', and one
395 or two `position-code's. The length of the sequence is determined
396 by the base leading-code. Leading-code takes the range 0x80
397 through 0x9F, whereas extended leading-code and position-code take
398 the range 0xA0 through 0xFF. See `charset.h' for more details
399 about leading-code and position-code.
401 There's one exception to this rule. Special leading-code
402 `leading-code-composition' denotes that the following several
403 characters should be composed into one character. Leading-codes of
404 components (except for ASCII) have 0x20 added. An ASCII character
405 component is represented by a 2-byte sequence of `0xA0' and
406 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
407 details of composite character. Hence, we can summarize the code
408 range as follows:
410 --- CODE RANGE of Emacs' internal format ---
411 (character set) (range)
412 ASCII 0x00 .. 0x7F
413 ELSE (1st byte) 0x80 .. 0x9F
414 (rest bytes) 0xA0 .. 0xFF
415 ---------------------------------------------
419 enum emacs_code_class_type emacs_code_class
[256];
421 /* Go to the next statement only if *SRC is accessible and the code is
422 in the range 0xA0..0xFF (i.e. not less than 0xA0). */
423 #define CHECK_CODE_RANGE_A0_FF \
425 if (src >= src_end) \
426 goto label_end_of_switch; \
427 else if (*src++ < 0xA0) \
431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
432 Check if a text is encoded in Emacs' internal format. If it is,
433 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
436 detect_coding_emacs_mule (src
, src_end
)
437 unsigned char *src
, *src_end
;
442 while (src
< src_end
)
454 switch (emacs_code_class
[c
])
456 case EMACS_ascii_code
:
457 case EMACS_linefeed_code
:
460 case EMACS_control_code
:
461 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
465 case EMACS_invalid_code
:
468 case EMACS_leading_code_composition
: /* c == 0x80 */
470 CHECK_CODE_RANGE_A0_FF
;
475 case EMACS_leading_code_4
:
476 CHECK_CODE_RANGE_A0_FF
;
477 /* fall down to check it two more times ... */
479 case EMACS_leading_code_3
:
480 CHECK_CODE_RANGE_A0_FF
;
481 /* fall down to check it one more time ... */
483 case EMACS_leading_code_2
:
484 CHECK_CODE_RANGE_A0_FF
;
492 return CODING_CATEGORY_MASK_EMACS_MULE
;
496 /*** 3. ISO2022 handlers ***/
498 /* The following note describes the coding system ISO2022 briefly.
499 Since the intention of this note is to help in understanding of
500 the programs in this file, some parts are NOT ACCURATE or OVERLY
501 SIMPLIFIED. For the thorough understanding, please refer to the
502 original document of ISO2022.
504 ISO2022 provides many mechanisms to encode several character sets
505 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
506 all text is encoded by codes of less than 128. This may make the
507 encoded text a little bit longer, but the text gets more stability
508 to pass through several gateways (some of them strip off the MSB).
510 There are two kinds of character set: control character set and
511 graphic character set. The former contains control characters such
512 as `newline' and `escape' to provide control functions (control
513 functions are provided also by escape sequences). The latter
514 contains graphic characters such as 'A' and '-'. Emacs recognizes
515 two control character sets and many graphic character sets.
517 Graphic character sets are classified into one of the following
518 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
519 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
520 bytes (DIMENSION) and the number of characters in one dimension
521 (CHARS) of the set. In addition, each character set is assigned an
522 identification tag (called "final character" and denoted as <F>
523 hereafter) which is unique in each class. <F> of each character
524 set is decided by ECMA(*) when it is registered in ISO. Code range
525 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
527 Note (*): ECMA = European Computer Manufacturers Association
529 Here are examples of graphic character set [NAME(<F>)]:
530 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
531 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
532 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
533 o DIMENSION2_CHARS96 -- none for the moment
535 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
536 C0 [0x00..0x1F] -- control character plane 0
537 GL [0x20..0x7F] -- graphic character plane 0
538 C1 [0x80..0x9F] -- control character plane 1
539 GR [0xA0..0xFF] -- graphic character plane 1
541 A control character set is directly designated and invoked to C0 or
542 C1 by an escape sequence. The most common case is that ISO646's
543 control character set is designated/invoked to C0 and ISO6429's
544 control character set is designated/invoked to C1, and usually
545 these designations/invocations are omitted in a coded text. With
546 7-bit environment, only C0 can be used, and a control character for
547 C1 is encoded by an appropriate escape sequence to fit in the
548 environment. All control characters for C1 have
549 corresponding escape sequences defined.
551 A graphic character set is at first designated to one of four
552 graphic registers (G0 through G3), then these graphic registers are
553 invoked to GL or GR. These designations and invocations can be
554 done independently. The most common case is that G0 is invoked to
555 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
556 these invocations and designations are omitted in a coded text.
557 With 7-bit environment, only GL can be used.
559 When a graphic character set of CHARS94 is invoked to GL, code 0x20
560 and 0x7F of GL area work as control characters SPACE and DEL
561 respectively, and code 0xA0 and 0xFF of GR area should not be used.
563 There are two ways of invocation: locking-shift and single-shift.
564 With locking-shift, the invocation lasts until the next different
565 invocation, whereas with single-shift, the invocation works only
566 for the following character and doesn't affect locking-shift.
567 Invocations are done by the following control characters or escape
568 sequences:
570 ----------------------------------------------------------------------
571 function control char escape sequence description
572 ----------------------------------------------------------------------
573 SI (shift-in) 0x0F none invoke G0 to GL
574 SO (shift-out) 0x0E none invoke G1 to GL
575 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
576 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
577 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
578 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
579 ----------------------------------------------------------------------
580 The first four are for locking-shift. Control characters for these
581 functions are defined by macros ISO_CODE_XXX in `coding.h'.
583 Designations are done by the following escape sequences.
584 ----------------------------------------------------------------------
585 escape sequence description
586 ----------------------------------------------------------------------
587 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
588 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
589 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
590 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
591 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
592 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
593 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
594 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
595 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
596 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
597 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
598 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
599 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
600 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
601 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
602 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
603 ----------------------------------------------------------------------
605 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
606 of dimension 1, chars 94, and final character <F>, etc.
608 Note (*): Although these designations are not allowed in ISO2022,
609 Emacs accepts them on decoding, and produces them on encoding
610 CHARS96 character set in a coding system which is characterized as
611 7-bit environment, non-locking-shift, and non-single-shift.
613 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
614 '(' can be omitted. We call this the "short-form" hereafter.
616 Now you may notice that there are a lot of ways for encoding the
617 same multilingual text in ISO2022. Actually, there exist many
618 coding systems such as Compound Text (used in X's inter-client
619 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
620 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
621 localized platforms), and all of these are variants of ISO2022.
623 In addition to the above, Emacs handles two more kinds of escape
624 sequences: ISO6429's direction specification and Emacs' private
625 sequence for specifying character composition.
627 ISO6429's direction specification takes the following format:
628 o CSI ']' -- end of the current direction
629 o CSI '0' ']' -- end of the current direction
630 o CSI '1' ']' -- start of left-to-right text
631 o CSI '2' ']' -- start of right-to-left text
632 The control character CSI (0x9B: control sequence introducer) is
633 abbreviated to the escape sequence ESC '[' in 7-bit environment.
635 Character composition specification takes the following format:
636 o ESC '0' -- start character composition
637 o ESC '1' -- end character composition
638 Since these are not standard escape sequences of any ISO, their use
639 with these meanings is restricted to Emacs only. */
641 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if the coding system stored in coding_system_table[IDX]
   accepts CHARSET: either CHARSET is marked in that coding system's
   safe_charsets table, or the coding system has a requested
   designation for CHARSET.  */
#define CHARSET_OK(idx, charset)					\
  (((coding_system_table[(idx)])->safe_charsets[(charset)])		\
   || ((CODING_SPEC_ISO_REQUESTED_DESIGNATION				\
	((coding_system_table[(idx)]), (charset)))			\
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
/* Nonzero if the coding system stored in coding_system_table[IDX]
   has a non-negative initial designation for register 1
   (presumably graphic register G1 -- confirm against coding.h).  */
#define SHIFT_OUT_OK(idx)						\
  (0 <= CODING_SPEC_ISO_INITIAL_DESIGNATION ((coding_system_table[(idx)]), 1))
652 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
653 Check if a text is encoded in ISO2022. If it is, returns an
654 integer in which any of the appropriate flag bits:
655 CODING_CATEGORY_MASK_ISO_7
656 CODING_CATEGORY_MASK_ISO_7_TIGHT
657 CODING_CATEGORY_MASK_ISO_8_1
658 CODING_CATEGORY_MASK_ISO_8_2
659 CODING_CATEGORY_MASK_ISO_7_ELSE
660 CODING_CATEGORY_MASK_ISO_8_ELSE
661 are set. If a code which should never appear in ISO2022 is found,
662 returns 0. */
665 detect_coding_iso2022 (src
, src_end
)
666 unsigned char *src
, *src_end
;
668 int mask
= CODING_CATEGORY_MASK_ISO
;
670 int reg
[4], shift_out
= 0;
671 int c
, c1
, i
, charset
;
673 reg
[0] = CHARSET_ASCII
, reg
[1] = reg
[2] = reg
[3] = -1;
674 while (mask
&& src
< src_end
)
683 if (c
>= '(' && c
<= '/')
685 /* Designation sequence for a charset of dimension 1. */
689 if (c1
< ' ' || c1
>= 0x80
690 || (charset
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
691 /* Invalid designation sequence. Just ignore. */
693 reg
[(c
- '(') % 4] = charset
;
697 /* Designation sequence for a charset of dimension 2. */
701 if (c
>= '@' && c
<= 'B')
702 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
703 reg
[0] = charset
= iso_charset_table
[1][0][c
];
704 else if (c
>= '(' && c
<= '/')
709 if (c1
< ' ' || c1
>= 0x80
710 || (charset
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
711 /* Invalid designation sequence. Just ignore. */
713 reg
[(c
- '(') % 4] = charset
;
716 /* Invalid designation sequence. Just ignore. */
719 else if (c
== 'N' || c
== 'n')
723 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
724 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
726 /* Locking shift out. */
727 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
728 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
733 else if (c
== 'O' || c
== 'o')
737 /* Locking shift in. */
738 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
739 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
744 else if (c
== '0' || c
== '1' || c
== '2')
745 /* Start/end composition. Just ignore. */
748 /* Invalid escape sequence. Just ignore. */
751 /* We found a valid designation sequence for CHARSET. */
752 mask
&= ~CODING_CATEGORY_MASK_ISO_8BIT
;
753 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7
, charset
))
754 mask_found
|= CODING_CATEGORY_MASK_ISO_7
;
756 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
757 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT
, charset
))
758 mask_found
|= CODING_CATEGORY_MASK_ISO_7_TIGHT
;
760 mask
&= ~CODING_CATEGORY_MASK_ISO_7_TIGHT
;
761 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
, charset
))
762 mask
&= ~CODING_CATEGORY_MASK_ISO_7_ELSE
;
763 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
, charset
))
764 mask
&= ~CODING_CATEGORY_MASK_ISO_8_ELSE
;
770 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
771 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
773 /* Locking shift out. */
774 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
775 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
782 /* Locking shift in. */
783 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
784 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
792 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
794 if (c
!= ISO_CODE_CSI
)
796 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
797 & CODING_FLAG_ISO_SINGLE_SHIFT
)
798 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
799 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
800 & CODING_FLAG_ISO_SINGLE_SHIFT
)
801 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
803 if (VECTORP (Vlatin_extra_code_table
)
804 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
806 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
807 & CODING_FLAG_ISO_LATIN_EXTRA
)
808 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
809 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
810 & CODING_FLAG_ISO_LATIN_EXTRA
)
811 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
814 mask_found
|= newmask
;
823 if (VECTORP (Vlatin_extra_code_table
)
824 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
828 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
829 & CODING_FLAG_ISO_LATIN_EXTRA
)
830 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
831 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
832 & CODING_FLAG_ISO_LATIN_EXTRA
)
833 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
835 mask_found
|= newmask
;
842 unsigned char *src_begin
= src
;
844 mask
&= ~(CODING_CATEGORY_MASK_ISO_7BIT
845 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
846 mask_found
|= CODING_CATEGORY_MASK_ISO_8_1
;
847 while (src
< src_end
&& *src
>= 0xA0)
849 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
850 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
852 mask_found
|= CODING_CATEGORY_MASK_ISO_8_2
;
858 return (mask
& mask_found
);
861 /* Decode a character of which charset is CHARSET and the 1st position
862 code is C1. If dimension of CHARSET is 2, the 2nd position code is
863 fetched from SRC and set to C2. If CHARSET is negative, it means
864 that we are decoding ill-formed text, and what we can do is just to
865 decode C1 as if it were an ASCII character. */
867 #define DECODE_ISO_CHARACTER(charset, c1) \
869 int c_alt, charset_alt = (charset); \
870 if (COMPOSING_HEAD_P (coding->composing)) \
872 *dst++ = LEADING_CODE_COMPOSITION; \
873 if (COMPOSING_WITH_RULE_P (coding->composing)) \
874 /* To tell composition rules are embeded. */ \
876 coding->composing += 2; \
878 if ((charset) >= 0) \
880 if (CHARSET_DIMENSION (charset) == 2) \
882 ONE_MORE_BYTE (c2); \
883 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
884 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
890 if (!NILP (unification_table) \
891 && ((c_alt = unify_char (unification_table, \
892 -1, (charset), c1, c2)) >= 0)) \
893 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
895 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
896 DECODE_CHARACTER_ASCII (c1); \
897 else if (CHARSET_DIMENSION (charset_alt) == 1) \
898 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
900 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
901 if (COMPOSING_WITH_RULE_P (coding->composing)) \
902 /* To tell a composition rule follows. */ \
903 coding->composing = COMPOSING_WITH_RULE_RULE; \
906 /* Set designation state into CODING. */
907 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
909 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
910 make_number (chars), \
911 make_number (final_char)); \
913 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
914 || coding->safe_charsets[charset])) \
916 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
918 && charset == CHARSET_ASCII) \
920 /* We should insert this designation sequence as is so \
921 that it is surely written back to a file. */ \
922 coding->spec.iso2022.last_invalid_designation_register = -1; \
923 goto label_invalid_code; \
925 coding->spec.iso2022.last_invalid_designation_register = -1; \
926 if ((coding->mode & CODING_MODE_DIRECTION) \
927 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
928 charset = CHARSET_REVERSE_CHARSET (charset); \
929 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
933 coding->spec.iso2022.last_invalid_designation_register = reg; \
934 goto label_invalid_code; \
938 /* Check if the current composing sequence contains only valid codes.
939 If the composing sequence doesn't end before SRC_END, return -1.
940 Else, if it contains only valid codes, return 0.
941 Else return the length of the composing sequence. */
943 int check_composing_code (coding
, src
, src_end
)
944 struct coding_system
*coding
;
945 unsigned char *src
, *src_end
;
947 unsigned char *src_start
= src
;
948 int invalid_code_found
= 0;
949 int charset
, c
, c1
, dim
;
951 while (src
< src_end
)
953 if (*src
++ != ISO_CODE_ESC
) continue;
954 if (src
>= src_end
) break;
955 if ((c
= *src
++) == '1') /* end of compsition */
956 return (invalid_code_found
? src
- src_start
: 0);
957 if (src
+ 2 >= src_end
) break;
958 if (!coding
->flags
& CODING_FLAG_ISO_DESIGNATION
)
959 invalid_code_found
= 1;
966 c
= (*src
>= '@' && *src
<= 'B') ? '(' : *src
++;
968 if (c
>= '(' && c
<= '/')
971 if ((c1
< ' ' || c1
>= 0x80)
972 || (charset
= iso_charset_table
[dim
][c
>= ','][c1
]) < 0
973 || ! coding
->safe_charsets
[charset
]
974 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
975 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
976 invalid_code_found
= 1;
979 invalid_code_found
= 1;
982 return ((coding
->mode
& CODING_MODE_LAST_BLOCK
) ? src_end
- src_start
: -1);
985 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
988 decode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
989 struct coding_system
*coding
;
990 unsigned char *source
, *destination
;
991 int src_bytes
, dst_bytes
;
993 unsigned char *src
= source
;
994 unsigned char *src_end
= source
+ src_bytes
;
995 unsigned char *dst
= destination
;
996 unsigned char *dst_end
= destination
+ dst_bytes
;
997 /* Since the maximum bytes produced by each loop is 7, we subtract 6
998 from DST_END to assure that overflow checking is necessary only
999 at the head of loop. */
1000 unsigned char *adjusted_dst_end
= dst_end
- 6;
1002 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1003 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1004 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1005 Lisp_Object unification_table
1006 = coding
->character_unification_table_for_decode
;
1007 int result
= CODING_FINISH_NORMAL
;
1009 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1010 unification_table
= Vstandard_character_unification_table_for_decode
;
1012 coding
->produced_char
= 0;
1013 coding
->fake_multibyte
= 0;
1014 while (src
< src_end
&& (dst_bytes
1015 ? (dst
< adjusted_dst_end
)
1018 /* SRC_BASE remembers the start position in source in each loop.
1019 The loop will be exited when there's not enough source text
1020 to analyze long escape sequence or 2-byte code (within macros
1021 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1022 to SRC_BASE before exiting. */
1023 unsigned char *src_base
= src
;
1024 int c1
= *src
++, c2
;
1026 switch (iso_code_class
[c1
])
1028 case ISO_0x20_or_0x7F
:
1029 if (!coding
->composing
1030 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
1032 /* This is SPACE or DEL. */
1034 coding
->produced_char
++;
1037 /* This is a graphic character, we fall down ... */
1039 case ISO_graphic_plane_0
:
1040 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1042 /* This is a composition rule. */
1044 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
1047 DECODE_ISO_CHARACTER (charset0
, c1
);
1050 case ISO_0xA0_or_0xFF
:
1051 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94
1052 || coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1053 goto label_invalid_code
;
1054 /* This is a graphic character, we fall down ... */
1056 case ISO_graphic_plane_1
:
1057 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1058 goto label_invalid_code
;
1060 DECODE_ISO_CHARACTER (charset1
, c1
);
1063 case ISO_control_code
:
1064 /* All ISO2022 control characters in this class have the
1065 same representation in Emacs internal format. */
1067 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1068 && (coding
->eol_type
== CODING_EOL_CR
1069 || coding
->eol_type
== CODING_EOL_CRLF
))
1071 result
= CODING_FINISH_INCONSISTENT_EOL
;
1072 goto label_end_of_loop_2
;
1075 coding
->produced_char
++;
1078 case ISO_carriage_return
:
1079 if (coding
->eol_type
== CODING_EOL_CR
)
1081 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1084 if (c1
== ISO_CODE_LF
)
1088 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1090 result
= CODING_FINISH_INCONSISTENT_EOL
;
1091 goto label_end_of_loop_2
;
1099 coding
->produced_char
++;
1103 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1104 || CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
1105 goto label_invalid_code
;
1106 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
1107 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1111 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
1112 goto label_invalid_code
;
1113 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
1114 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1117 case ISO_single_shift_2_7
:
1118 case ISO_single_shift_2
:
1119 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1120 goto label_invalid_code
;
1121 /* SS2 is handled as an escape sequence of ESC 'N' */
1123 goto label_escape_sequence
;
1125 case ISO_single_shift_3
:
1126 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1127 goto label_invalid_code
;
1128 /* SS3 is handled as an escape sequence of ESC 'O' */
1130 goto label_escape_sequence
;
1132 case ISO_control_sequence_introducer
:
1133 /* CSI is handled as an escape sequence of ESC '[' ... */
1135 goto label_escape_sequence
;
1139 label_escape_sequence
:
1140 /* Escape sequences handled by Emacs are invocation,
1141 designation, direction specification, and character
1142 composition specification. */
1145 case '&': /* revision of following character set */
1147 if (!(c1
>= '@' && c1
<= '~'))
1148 goto label_invalid_code
;
1150 if (c1
!= ISO_CODE_ESC
)
1151 goto label_invalid_code
;
1153 goto label_escape_sequence
;
1155 case '$': /* designation of 2-byte character set */
1156 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1157 goto label_invalid_code
;
1159 if (c1
>= '@' && c1
<= 'B')
1160 { /* designation of JISX0208.1978, GB2312.1980,
1162 DECODE_DESIGNATION (0, 2, 94, c1
);
1164 else if (c1
>= 0x28 && c1
<= 0x2B)
1165 { /* designation of DIMENSION2_CHARS94 character set */
1167 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
1169 else if (c1
>= 0x2C && c1
<= 0x2F)
1170 { /* designation of DIMENSION2_CHARS96 character set */
1172 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
1175 goto label_invalid_code
;
1178 case 'n': /* invocation of locking-shift-2 */
1179 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1180 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1181 goto label_invalid_code
;
1182 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
1183 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1186 case 'o': /* invocation of locking-shift-3 */
1187 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1188 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1189 goto label_invalid_code
;
1190 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
1191 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1194 case 'N': /* invocation of single-shift-2 */
1195 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1196 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1197 goto label_invalid_code
;
1199 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
1200 DECODE_ISO_CHARACTER (charset
, c1
);
1203 case 'O': /* invocation of single-shift-3 */
1204 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1205 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1206 goto label_invalid_code
;
1208 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1209 DECODE_ISO_CHARACTER (charset
, c1
);
1212 case '0': case '2': /* start composing */
1213 /* Before processing composing, we must be sure that all
1214 characters being composed are supported by CODING.
1215 If not, we must give up composing and insert the
1216 bunch of codes for composing as is without decoding. */
1220 result1
= check_composing_code (coding
, src
, src_end
);
1222 coding
->composing
= (c1
== '0'
1223 ? COMPOSING_NO_RULE_HEAD
1224 : COMPOSING_WITH_RULE_HEAD
);
1225 else if (result1
> 0)
1227 if (result1
+ 2 < (dst_bytes
? dst_end
: src_base
) - dst
)
1229 bcopy (src_base
, dst
, result1
+ 2);
1232 coding
->produced_char
+= result1
+ 2;
1236 result
= CODING_FINISH_INSUFFICIENT_DST
;
1237 goto label_end_of_loop_2
;
1241 goto label_end_of_loop
;
1245 case '1': /* end composing */
1246 coding
->composing
= COMPOSING_NO
;
1247 coding
->produced_char
++;
1250 case '[': /* specification of direction */
1251 if (coding
->flags
& CODING_FLAG_ISO_NO_DIRECTION
)
1252 goto label_invalid_code
;
1253 /* For the moment, nested direction is not supported.
1254 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1255 left-to-right, and nonzero means right-to-left. */
1259 case ']': /* end of the current direction */
1260 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1262 case '0': /* end of the current direction */
1263 case '1': /* start of left-to-right direction */
1266 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1268 goto label_invalid_code
;
1271 case '2': /* start of right-to-left direction */
1274 coding
->mode
|= CODING_MODE_DIRECTION
;
1276 goto label_invalid_code
;
1280 goto label_invalid_code
;
1285 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1286 goto label_invalid_code
;
1287 if (c1
>= 0x28 && c1
<= 0x2B)
1288 { /* designation of DIMENSION1_CHARS94 character set */
1290 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1292 else if (c1
>= 0x2C && c1
<= 0x2F)
1293 { /* designation of DIMENSION1_CHARS96 character set */
1295 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1299 goto label_invalid_code
;
1302 /* We must update these variables now. */
1303 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1304 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1308 while (src_base
< src
)
1309 *dst
++ = *src_base
++;
1310 coding
->fake_multibyte
= 1;
1315 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1316 label_end_of_loop_2
:
1323 if (result
== CODING_FINISH_NORMAL
)
1324 result
= CODING_FINISH_INSUFFICIENT_DST
;
1325 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
1326 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
1328 /* This is the last block of the text to be decoded. We had
1329 better just flush out all remaining codes in the text
1330 although they are not valid characters. */
1331 src_bytes
= src_end
- src
;
1332 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
1333 src_bytes
= dst_end
- dst
;
1334 bcopy (src
, dst
, src_bytes
);
1337 coding
->fake_multibyte
= 1;
1341 coding
->consumed
= coding
->consumed_char
= src
- source
;
1342 coding
->produced
= dst
- destination
;
1346 /* ISO2022 encoding stuff. */
1349 It is not enough to say just "ISO2022" on encoding, we have to
1350 specify more details. In Emacs, each coding system of ISO2022
1351 variant has the following specifications:
1352 1. Initial designation to G0 thru G3.
1353 2. Allows short-form designation?
1354 3. ASCII should be designated to G0 before control characters?
1355 4. ASCII should be designated to G0 at end of line?
1356 5. 7-bit environment or 8-bit environment?
1357 6. Use locking-shift?
1358 7. Use Single-shift?
1359 And the following two are only for Japanese:
1360 8. Use ASCII in place of JIS0201-1976-Roman?
1361 9. Use JISX0208-1983 in place of JISX0208-1978?
1362 These specifications are encoded in `coding->flags' as flag bits
1363 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
/* Emit at `dst' the escape sequence designating CHARSET to graphic
   register REG, and record the designation in CODING.  When CHARSET
   has a revision number below 255, a revision sequence
   (ESC & <'@' + revision>) is emitted first.  The short form (no
   intermediate character) is used only for a 2-byte 94-charset
   designated to register 0 whose <final-char> is '@', 'A', or 'B',
   and only if CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-byte charset: `$' precedes the intermediate.  */       \
        *dst++ = '$';                                                   \
        if (CHARSET_CHARS (charset) != 94)                              \
          *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
        else if (! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM)        \
                    && reg == 0                                         \
                    && final_char >= '@' && final_char <= 'B'))         \
          *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
      }                                                                 \
    else if (CHARSET_CHARS (charset) == 94)                             \
      *dst++ = (unsigned char) (intermediate_char_94[reg]);             \
    else                                                                \
      *dst++ = (unsigned char) (intermediate_char_96[reg]);             \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
/* Emit at `dst' the code for an ISO2022 single-shift function
   (single-shift-2 or single-shift-3): the escape sequence form when
   `coding' is a 7-bit environment, otherwise the 8-bit control
   character (in which case `coding->fake_multibyte' is set).  Both
   macros then mark `coding' as being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS2;                                  \
        coding->fake_multibyte = 1;                             \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS3;                                  \
        coding->fake_multibyte = 1;                             \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
/* Emit at `dst' the code for an ISO2022 locking-shift function and
   record in `coding' which graphic register is now invoked to graphic
   plane 0: shift-in invokes register 0, shift-out register 1,
   locking-shift-2 register 2, and locking-shift-3 register 3.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
1465 /* Produce codes for a DIMENSION1 character whose character set is
1466 CHARSET and whose position-code is C1. Designation and invocation
1467 sequences are also produced in advance if necessary. */
1470 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1472 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1474 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1475 *dst++ = c1 & 0x7F; \
1477 *dst++ = c1 | 0x80; \
1478 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1481 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1483 *dst++ = c1 & 0x7F; \
1486 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1488 *dst++ = c1 | 0x80; \
1491 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1492 && !coding->safe_charsets[charset]) \
1494 /* We should not encode this character, instead produce one or \
1496 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1497 if (CHARSET_WIDTH (charset) == 2) \
1498 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1502 /* Since CHARSET is not yet invoked to any graphic planes, we \
1503 must invoke it, or, at first, designate it to some graphic \
1504 register. Then repeat the loop to actually produce the \
1506 dst = encode_invocation_designation (charset, coding, dst); \
1509 /* Produce codes for a DIMENSION2 character whose character set is
1510 CHARSET and whose position-codes are C1 and C2. Designation and
1511 invocation codes are also produced in advance if necessary. */
1513 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1515 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1517 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1518 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1520 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1521 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1524 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1526 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1529 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1531 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1534 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1535 && !coding->safe_charsets[charset]) \
1537 /* We should not encode this character, instead produce one or \
1539 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1540 if (CHARSET_WIDTH (charset) == 2) \
1541 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1545 /* Since CHARSET is not yet invoked to any graphic planes, we \
1546 must invoke it, or, at first, designate it to some graphic \
1547 register. Then repeat the loop to actually produce the \
1549 dst = encode_invocation_designation (charset, coding, dst); \
1552 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1554 int c_alt, charset_alt; \
1555 if (!NILP (unification_table) \
1556 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1558 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1560 charset_alt = charset; \
1561 if (CHARSET_DIMENSION (charset_alt) == 1) \
1563 if (charset == CHARSET_ASCII \
1564 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
1565 charset_alt = charset_latin_jisx0201; \
1566 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1570 if (charset == charset_jisx0208 \
1571 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
1572 charset_alt = charset_jisx0208_1978; \
1573 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1575 if (! COMPOSING_P (coding->composing)) \
1576 coding->consumed_char++; \
1579 /* Produce designation and invocation codes at a place pointed by DST
1580 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1584 encode_invocation_designation (charset
, coding
, dst
)
1586 struct coding_system
*coding
;
1589 int reg
; /* graphic register number */
1591 /* At first, check designations. */
1592 for (reg
= 0; reg
< 4; reg
++)
1593 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1598 /* CHARSET is not yet designated to any graphic registers. */
1599 /* At first check the requested designation. */
1600 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1601 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1602 /* Since CHARSET requests no special designation, designate it
1603 to graphic register 0. */
1606 ENCODE_DESIGNATION (charset
, reg
, coding
);
1609 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1610 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1612 /* Since the graphic register REG is not invoked to any graphic
1613 planes, invoke it to graphic plane 0. */
1616 case 0: /* graphic register 0 */
1620 case 1: /* graphic register 1 */
1624 case 2: /* graphic register 2 */
1625 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1626 ENCODE_SINGLE_SHIFT_2
;
1628 ENCODE_LOCKING_SHIFT_2
;
1631 case 3: /* graphic register 3 */
1632 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1633 ENCODE_SINGLE_SHIFT_3
;
1635 ENCODE_LOCKING_SHIFT_3
;
1642 /* The following two macros produce codes for indicating composition. */
1643 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1644 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1645 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
/* The following three macros produce codes for indicating the
   direction of text.  They write through the variable `dst' and read
   `coding->flags' at the expansion site, and each expands to a single
   statement.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits CSI: the 2-byte form
   `ESC [' when CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags'
   (tested as a bit, since other flag bits may be set too), otherwise
   the single 8-bit control code ISO_CODE_CSI.

   ENCODE_DIRECTION_R2L emits CSI followed by `2 ]', and
   ENCODE_DIRECTION_L2R emits CSI followed by `0 ]'.  */

#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* The introducer macro is a `do ... while (0)' statement, so it must
   be used as a statement here rather than as an operand of the comma
   operator.  */
#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2', *dst++ = ']';                         \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0', *dst++ = ']';                         \
  } while (0)
/* Emit at `dst' the codes that bring the graphic planes and registers
   of `coding' back to their initial state: a shift-in if graphic
   plane 0 does not currently invoke register 0, followed by a
   designation sequence for every register that has an initial
   designation differing from its current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0          \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)           \
                != CODING_SPEC_ISO_DESIGNATION (coding, reg)))              \
          ENCODE_DESIGNATION                                                \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
        reg++;                                                              \
      }                                                                     \
  } while (0)
1678 /* Produce designation sequences of charsets in the line started from
1679 SRC to a place pointed by *DSTP, and update DSTP.
1681 If the current block ends before any end-of-line, we may fail to
1682 find all the necessary designations. */
1684 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1685 struct coding_system
*coding
;
1687 unsigned char *src
, *src_end
, **dstp
;
1689 int charset
, c
, found
= 0, reg
;
1690 /* Table of charsets to be designated to each graphic register. */
1692 unsigned char *dst
= *dstp
;
1694 for (reg
= 0; reg
< 4; reg
++)
1697 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1699 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1702 charset
= CHARSET_AT (src
);
1706 unsigned char c1
, c2
;
1708 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1709 if ((c_alt
= unify_char (table
, -1, charset
, c1
, c2
)) >= 0)
1710 charset
= CHAR_CHARSET (c_alt
);
1713 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1714 if (reg
!= CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
&& r
[reg
] < 0)
1725 for (reg
= 0; reg
< 4; reg
++)
1727 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1728 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1733 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1736 encode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1737 struct coding_system
*coding
;
1738 unsigned char *source
, *destination
;
1739 int src_bytes
, dst_bytes
;
1741 unsigned char *src
= source
;
1742 unsigned char *src_end
= source
+ src_bytes
;
1743 unsigned char *dst
= destination
;
1744 unsigned char *dst_end
= destination
+ dst_bytes
;
1745 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1746 from DST_END to assure overflow checking is necessary only at the
1748 unsigned char *adjusted_dst_end
= dst_end
- 19;
1749 Lisp_Object unification_table
1750 = coding
->character_unification_table_for_encode
;
1751 int result
= CODING_FINISH_NORMAL
;
1753 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1754 unification_table
= Vstandard_character_unification_table_for_encode
;
1756 coding
->consumed_char
= 0;
1757 coding
->fake_multibyte
= 0;
1758 while (src
< src_end
&& (dst_bytes
1759 ? (dst
< adjusted_dst_end
)
1760 : (dst
< src
- 19)))
1762 /* SRC_BASE remembers the start position in source in each loop.
1763 The loop will be exited when there's not enough source text
1764 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1765 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1766 reset to SRC_BASE before exiting. */
1767 unsigned char *src_base
= src
;
1768 int charset
, c1
, c2
, c3
, c4
;
1770 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1771 && CODING_SPEC_ISO_BOL (coding
))
1773 /* We have to produce designation sequences if any now. */
1774 encode_designation_at_bol (coding
, unification_table
,
1775 src
, src_end
, &dst
);
1776 CODING_SPEC_ISO_BOL (coding
) = 0;
1780 /* If we are seeing a component of a composite character, we are
1781 seeing a leading-code encoded irregularly for composition, or
1782 a composition rule if composing with rule. We must set C1 to
1783 a normal leading-code or an ASCII code. If we are not seeing
1784 a composite character, we must reset composition,
1785 designation, and invocation states. */
1786 if (COMPOSING_P (coding
->composing
))
1790 /* We are not in a composite character any longer. */
1791 coding
->composing
= COMPOSING_NO
;
1792 ENCODE_RESET_PLANE_AND_REGISTER
;
1793 ENCODE_COMPOSITION_END
;
1797 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1800 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1803 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1804 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1807 /* This is an ASCII component. */
1812 /* This is a leading-code of non ASCII component. */
1817 /* Now encode one character. C1 is a control character, an
1818 ASCII character, or a leading-code of multi-byte character. */
1819 switch (emacs_code_class
[c1
])
1821 case EMACS_ascii_code
:
1822 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1825 case EMACS_control_code
:
1826 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1827 ENCODE_RESET_PLANE_AND_REGISTER
;
1829 coding
->consumed_char
++;
1832 case EMACS_carriage_return_code
:
1833 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
1835 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1836 ENCODE_RESET_PLANE_AND_REGISTER
;
1838 coding
->consumed_char
++;
1841 /* fall down to treat '\r' as '\n' ... */
1843 case EMACS_linefeed_code
:
1844 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1845 ENCODE_RESET_PLANE_AND_REGISTER
;
1846 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1847 bcopy (coding
->spec
.iso2022
.initial_designation
,
1848 coding
->spec
.iso2022
.current_designation
,
1849 sizeof coding
->spec
.iso2022
.initial_designation
);
1850 if (coding
->eol_type
== CODING_EOL_LF
1851 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1852 *dst
++ = ISO_CODE_LF
;
1853 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1854 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1856 *dst
++ = ISO_CODE_CR
;
1857 CODING_SPEC_ISO_BOL (coding
) = 1;
1858 coding
->consumed_char
++;
1861 case EMACS_leading_code_2
:
1865 /* invalid sequence */
1868 coding
->consumed_char
+= 2;
1871 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1874 case EMACS_leading_code_3
:
1875 TWO_MORE_BYTES (c2
, c3
);
1876 if (c2
< 0xA0 || c3
< 0xA0)
1878 /* invalid sequence */
1882 coding
->consumed_char
+= 3;
1884 else if (c1
< LEADING_CODE_PRIVATE_11
)
1885 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1887 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1890 case EMACS_leading_code_4
:
1891 THREE_MORE_BYTES (c2
, c3
, c4
);
1892 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1894 /* invalid sequence */
1899 coding
->consumed_char
+= 4;
1902 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1905 case EMACS_leading_code_composition
:
1909 /* invalid sequence */
1912 coding
->consumed_char
+= 2;
1914 else if (c2
== 0xFF)
1916 ENCODE_RESET_PLANE_AND_REGISTER
;
1917 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1918 ENCODE_COMPOSITION_WITH_RULE_START
;
1919 coding
->consumed_char
++;
1923 ENCODE_RESET_PLANE_AND_REGISTER
;
1924 /* Rewind one byte because it is a character code of
1925 composition elements. */
1927 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1928 ENCODE_COMPOSITION_NO_RULE_START
;
1929 coding
->consumed_char
++;
1933 case EMACS_invalid_code
:
1935 coding
->consumed_char
++;
1940 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1947 if (result
== CODING_FINISH_NORMAL
)
1948 result
= CODING_FINISH_INSUFFICIENT_DST
;
1950 /* If this is the last block of the text to be encoded, we
1951 must reset graphic planes and registers to the initial
1952 state, and flush out the carryover if any. */
1953 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
1954 ENCODE_RESET_PLANE_AND_REGISTER
;
1957 coding
->consumed
= src
- source
;
1958 coding
->produced
= coding
->produced_char
= dst
- destination
;
1963 /*** 4. SJIS and BIG5 handlers ***/
1965 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1966 quite widely. So, for the moment, Emacs supports them in the bare
1967 C code. But, in the future, they may be supported only by CCL. */
1969 /* SJIS is a coding system encoding three character sets: ASCII, right
1970 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1971 as is. A character of charset katakana-jisx0201 is encoded by
1972 "position-code + 0x80". A character of charset japanese-jisx0208
1973 is encoded in 2-byte but two position-codes are divided and shifted
1974 so that they fit in the range below.
1976 --- CODE RANGE of SJIS ---
1977 (character set) (range)
1979 KATAKANA-JISX0201 0xA0 .. 0xDF
1980 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1981 (2nd byte) 0x40 .. 0xFF
1982 -------------------------------
1986 /* BIG5 is a coding system encoding two character sets: ASCII and
1987 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1988 character set and is encoded in two-byte.
1990 --- CODE RANGE of BIG5 ---
1991 (character set) (range)
1993 Big5 (1st byte) 0xA1 .. 0xFE
1994 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1995 --------------------------
1997 Since the number of characters in Big5 is larger than maximum
1998 characters in Emacs' charset (96x96), it can't be handled as one
1999 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2000 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2001 contains frequently used characters and the latter contains less
2002 frequently used characters. */
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   a first byte below 0xC9 selects `charset_big5_1', anything else
   `charset_big5_2'.  ENCODE_BIG5 reads CHARSET, C1 and C2 and stores
   into B1 and B2.  Every argument is parenthesized in the expansion,
   so expressions such as `c | 0x80' may be passed safely; arguments
   are evaluated more than once, so they must have no side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character within the whole Big5 table.  */  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Skip the gap between the two ranges of the 2nd byte.  */        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2037 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2039 int c_alt, charset_alt = (charset); \
2040 if (!NILP (unification_table) \
2041 && ((c_alt = unify_char (unification_table, \
2042 -1, (charset), c1, c2)) >= 0)) \
2043 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2044 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
2045 DECODE_CHARACTER_ASCII (c1); \
2046 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2047 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
2049 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
2052 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2054 int c_alt, charset_alt; \
2055 if (!NILP (unification_table) \
2056 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
2058 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2060 charset_alt = charset; \
2061 if (charset_alt == charset_ascii) \
2063 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2065 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
2069 *dst++ = charset_alt, *dst++ = c1; \
2070 coding->fake_multibyte = 1; \
2075 c1 &= 0x7F, c2 &= 0x7F; \
2076 if (sjis_p && charset_alt == charset_jisx0208) \
2078 unsigned char s1, s2; \
2080 ENCODE_SJIS (c1, c2, s1, s2); \
2081 *dst++ = s1, *dst++ = s2; \
2082 coding->fake_multibyte = 1; \
2085 && (charset_alt == charset_big5_1 \
2086 || charset_alt == charset_big5_2)) \
2088 unsigned char b1, b2; \
2090 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
2091 *dst++ = b1, *dst++ = b2; \
2095 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
2096 coding->fake_multibyte = 1; \
2099 coding->consumed_char++; \
2102 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2103 Check if a text is encoded in SJIS. If it is, return
2104 CODING_CATEGORY_MASK_SJIS, else return 0. */
2107 detect_coding_sjis (src
, src_end
)
2108 unsigned char *src
, *src_end
;
2112 while (src
< src_end
)
2115 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
2117 if (src
< src_end
&& *src
++ < 0x40)
2121 return CODING_CATEGORY_MASK_SJIS
;
2124 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2125 Check if a text is encoded in BIG5. If it is, return
2126 CODING_CATEGORY_MASK_BIG5, else return 0. */
2129 detect_coding_big5 (src
, src_end
)
2130 unsigned char *src
, *src_end
;
2134 while (src
< src_end
)
2142 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
2146 return CODING_CATEGORY_MASK_BIG5
;
2149 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2150 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
2153 decode_coding_sjis_big5 (coding
, source
, destination
,
2154 src_bytes
, dst_bytes
, sjis_p
)
2155 struct coding_system
*coding
;
2156 unsigned char *source
, *destination
;
2157 int src_bytes
, dst_bytes
;
2160 unsigned char *src
= source
;
2161 unsigned char *src_end
= source
+ src_bytes
;
2162 unsigned char *dst
= destination
;
2163 unsigned char *dst_end
= destination
+ dst_bytes
;
2164 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2165 from DST_END to assure overflow checking is necessary only at the
2167 unsigned char *adjusted_dst_end
= dst_end
- 3;
2168 Lisp_Object unification_table
2169 = coding
->character_unification_table_for_decode
;
2170 int result
= CODING_FINISH_NORMAL
;
2172 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
2173 unification_table
= Vstandard_character_unification_table_for_decode
;
2175 coding
->produced_char
= 0;
2176 coding
->fake_multibyte
= 0;
2177 while (src
< src_end
&& (dst_bytes
2178 ? (dst
< adjusted_dst_end
)
2181 /* SRC_BASE remembers the start position in source in each loop.
2182 The loop will be exited when there's not enough source text
2183 to analyze two-byte character (within macro ONE_MORE_BYTE).
2184 In that case, SRC is reset to SRC_BASE before exiting. */
2185 unsigned char *src_base
= src
;
2186 unsigned char c1
= *src
++, c2
, c3
, c4
;
2192 if (coding
->eol_type
== CODING_EOL_CRLF
)
2197 else if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2199 result
= CODING_FINISH_INCONSISTENT_EOL
;
2200 goto label_end_of_loop_2
;
2203 /* To process C2 again, SRC is subtracted by 1. */
2206 else if (coding
->eol_type
== CODING_EOL_CR
)
2212 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2213 && (coding
->eol_type
== CODING_EOL_CR
2214 || coding
->eol_type
== CODING_EOL_CRLF
))
2216 result
= CODING_FINISH_INCONSISTENT_EOL
;
2217 goto label_end_of_loop_2
;
2221 coding
->produced_char
++;
2224 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2227 /* SJIS -> JISX0208 */
2233 DECODE_SJIS (c1
, c2
, c3
, c4
);
2234 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
2237 goto label_invalid_code_2
;
2240 goto label_invalid_code_1
;
2244 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
2246 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
,
2253 if ((c2
>= 0x40 && c2
<= 0x7E) || (c2
>= 0xA1 && c2
<= 0xFE))
2255 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
2256 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
2259 goto label_invalid_code_2
;
2262 else /* C1 >= 0xE0 */
2264 /* SJIS -> JISX0208, BIG5 -> Big5 */
2270 DECODE_SJIS (c1
, c2
, c3
, c4
);
2271 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
2274 goto label_invalid_code_2
;
2281 if ((c2
>= 0x40 && c2
<= 0x7E) || (c2
>= 0xA1 && c2
<= 0xFE))
2283 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
2284 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
2287 goto label_invalid_code_2
;
2292 label_invalid_code_1
:
2294 coding
->produced_char
++;
2295 coding
->fake_multibyte
= 1;
2298 label_invalid_code_2
:
2299 *dst
++ = c1
; *dst
++= c2
;
2300 coding
->produced_char
+= 2;
2301 coding
->fake_multibyte
= 1;
2305 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2306 label_end_of_loop_2
:
2313 if (result
== CODING_FINISH_NORMAL
)
2314 result
= CODING_FINISH_INSUFFICIENT_DST
;
2315 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2316 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2318 src_bytes
= src_end
- src
;
2319 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2320 src_bytes
= dst_end
- dst
;
2321 bcopy (dst
, src
, src_bytes
);
2324 coding
->fake_multibyte
= 1;
2328 coding
->consumed
= coding
->consumed_char
= src
- source
;
2329 coding
->produced
= dst
- destination
;
2333 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2334 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2335 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
2336 sure that all these charsets are registered as official charset
2337 (i.e. do not have extended leading-codes). Characters of other
2338 charsets are produced without any encoding. If SJIS_P is 1, encode
2339 SJIS text, else encode BIG5 text. */
2342 encode_coding_sjis_big5 (coding
, source
, destination
,
2343 src_bytes
, dst_bytes
, sjis_p
)
2344 struct coding_system
*coding
;
2345 unsigned char *source
, *destination
;
2346 int src_bytes
, dst_bytes
;
2349 unsigned char *src
= source
;
2350 unsigned char *src_end
= source
+ src_bytes
;
2351 unsigned char *dst
= destination
;
2352 unsigned char *dst_end
= destination
+ dst_bytes
;
2353 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2354 from DST_END to assure overflow checking is necessary only at the
2356 unsigned char *adjusted_dst_end
= dst_end
- 1;
2357 Lisp_Object unification_table
2358 = coding
->character_unification_table_for_encode
;
2359 int result
= CODING_FINISH_NORMAL
;
2361 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
2362 unification_table
= Vstandard_character_unification_table_for_encode
;
2364 coding
->consumed_char
= 0;
2365 coding
->fake_multibyte
= 0;
2366 while (src
< src_end
&& (dst_bytes
2367 ? (dst
< adjusted_dst_end
)
2370 /* SRC_BASE remembers the start position in source in each loop.
2371 The loop will be exited when there's not enough source text
2372 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2373 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2375 unsigned char *src_base
= src
;
2376 unsigned char c1
= *src
++, c2
, c3
, c4
;
2378 if (coding
->composing
)
2385 else if (c1
>= 0xA0)
2388 coding
->composing
= 0;
2391 switch (emacs_code_class
[c1
])
2393 case EMACS_ascii_code
:
2394 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2397 case EMACS_control_code
:
2399 coding
->consumed_char
++;
2402 case EMACS_carriage_return_code
:
2403 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
2406 coding
->consumed_char
++;
2409 /* fall through to treat '\r' as '\n' ... */
2411 case EMACS_linefeed_code
:
2412 if (coding
->eol_type
== CODING_EOL_LF
2413 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2415 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2416 *dst
++ = '\r', *dst
++ = '\n';
2419 coding
->consumed_char
++;
2422 case EMACS_leading_code_2
:
2424 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
2427 case EMACS_leading_code_3
:
2428 TWO_MORE_BYTES (c2
, c3
);
2429 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
2432 case EMACS_leading_code_4
:
2433 THREE_MORE_BYTES (c2
, c3
, c4
);
2434 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
2437 case EMACS_leading_code_composition
:
2438 coding
->composing
= 1;
2441 default: /* i.e. case EMACS_invalid_code: */
2443 coding
->consumed_char
++;
2448 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2453 if (result
== CODING_FINISH_NORMAL
2455 result
= CODING_FINISH_INSUFFICIENT_DST
;
2456 coding
->consumed
= src
- source
;
2457 coding
->produced
= coding
->produced_char
= dst
- destination
;
2462 /*** 5. End-of-line handlers ***/
2464 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2465 This function is called only when `coding->eol_type' is
2466 CODING_EOL_CRLF or CODING_EOL_CR. */
2468 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2469 struct coding_system
*coding
;
2470 unsigned char *source
, *destination
;
2471 int src_bytes
, dst_bytes
;
2473 unsigned char *src
= source
;
2474 unsigned char *src_end
= source
+ src_bytes
;
2475 unsigned char *dst
= destination
;
2476 unsigned char *dst_end
= destination
+ dst_bytes
;
2478 int result
= CODING_FINISH_NORMAL
;
2480 coding
->fake_multibyte
= 0;
2485 switch (coding
->eol_type
)
2487 case CODING_EOL_CRLF
:
2489 /* Since the maximum bytes produced by each loop is 2, we
2490 subtract 1 from DST_END to assure overflow checking is
2491 necessary only at the head of loop. */
2492 unsigned char *adjusted_dst_end
= dst_end
- 1;
2494 while (src
< src_end
&& (dst_bytes
2495 ? (dst
< adjusted_dst_end
)
2498 unsigned char *src_base
= src
;
2506 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2508 result
= CODING_FINISH_INCONSISTENT_EOL
;
2509 goto label_end_of_loop_2
;
2512 if (BASE_LEADING_CODE_P (c
))
2513 coding
->fake_multibyte
= 1;
2518 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
))
2520 result
= CODING_FINISH_INCONSISTENT_EOL
;
2521 goto label_end_of_loop_2
;
2526 if (BASE_LEADING_CODE_P (c
))
2527 coding
->fake_multibyte
= 1;
2532 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2533 label_end_of_loop_2
:
2537 if (result
== CODING_FINISH_NORMAL
2539 result
= CODING_FINISH_INSUFFICIENT_DST
;
2544 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2546 while (src
< src_end
)
2548 if ((c
= *src
++) == '\n')
2550 if (BASE_LEADING_CODE_P (c
))
2551 coding
->fake_multibyte
= 1;
2555 src_bytes
= src
- source
;
2556 result
= CODING_FINISH_INCONSISTENT_EOL
;
2559 if (dst_bytes
&& src_bytes
> dst_bytes
)
2561 result
= CODING_FINISH_INSUFFICIENT_DST
;
2562 src_bytes
= dst_bytes
;
2565 bcopy (source
, destination
, src_bytes
);
2567 safe_bcopy (source
, destination
, src_bytes
);
2568 src
= source
+ src_bytes
;
2569 while (src_bytes
--) if (*dst
++ == '\r') dst
[-1] = '\n';
2572 default: /* i.e. case: CODING_EOL_LF */
2573 if (dst_bytes
&& src_bytes
> dst_bytes
)
2575 result
= CODING_FINISH_INSUFFICIENT_DST
;
2576 src_bytes
= dst_bytes
;
2579 bcopy (source
, destination
, src_bytes
);
2581 safe_bcopy (source
, destination
, src_bytes
);
2584 coding
->fake_multibyte
= 1;
2588 coding
->consumed
= coding
->consumed_char
= src
- source
;
2589 coding
->produced
= coding
->produced_char
= dst
- destination
;
2593 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2594 format of end-of-line according to `coding->eol_type'. If
2595 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2596 '\r' in source text also means end-of-line. */
2598 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2599 struct coding_system
*coding
;
2600 unsigned char *source
, *destination
;
2601 int src_bytes
, dst_bytes
;
2603 unsigned char *src
= source
;
2604 unsigned char *dst
= destination
;
2605 int result
= CODING_FINISH_NORMAL
;
2607 coding
->fake_multibyte
= 0;
2609 if (coding
->eol_type
== CODING_EOL_CRLF
)
2612 unsigned char *src_end
= source
+ src_bytes
;
2613 unsigned char *dst_end
= destination
+ dst_bytes
;
2614 /* Since the maximum bytes produced by each loop is 2, we
2615 subtract 1 from DST_END to assure overflow checking is
2616 necessary only at the head of loop. */
2617 unsigned char *adjusted_dst_end
= dst_end
- 1;
2619 while (src
< src_end
&& (dst_bytes
2620 ? (dst
< adjusted_dst_end
)
2625 || (c
== '\r' && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)))
2626 *dst
++ = '\r', *dst
++ = '\n';
2630 if (BASE_LEADING_CODE_P (c
))
2631 coding
->fake_multibyte
= 1;
2635 result
= CODING_FINISH_INSUFFICIENT_DST
;
2641 if (dst_bytes
&& src_bytes
> dst_bytes
)
2643 src_bytes
= dst_bytes
;
2644 result
= CODING_FINISH_INSUFFICIENT_DST
;
2647 bcopy (source
, destination
, src_bytes
);
2650 safe_bcopy (source
, destination
, src_bytes
);
2651 dst_bytes
= src_bytes
;
2653 if (coding
->eol_type
== CODING_EOL_CRLF
)
2657 if ((c
= *dst
++) == '\n')
2659 else if (BASE_LEADING_CODE_P (c
))
2660 coding
->fake_multibyte
= 1;
2665 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
2668 if (*dst
++ == '\r') dst
[-1] = '\n';
2670 coding
->fake_multibyte
= 1;
2672 src
= source
+ dst_bytes
;
2673 dst
= destination
+ dst_bytes
;
2676 coding
->consumed
= coding
->consumed_char
= src
- source
;
2677 coding
->produced
= coding
->produced_char
= dst
- destination
;
2682 /*** 6. C library functions ***/
2684 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2685 has a property `coding-system'. The value of this property is a
2686 vector of length 5 (called the coding-vector). Among the elements of
2687 this vector, the first (element[0]) and the fifth (element[4])
2688 carry important information for decoding/encoding. Before
2689 decoding/encoding, this information should be set in fields of a
2690 structure of type `coding_system'.
2692 A value of property `coding-system' can be a symbol of another
2693 subsidiary coding-system. In that case, Emacs gets coding-vector
2696 `element[0]' contains information to be set in `coding->type'. The
2697 value and its meaning is as follows:
2699 0 -- coding_type_emacs_mule
2700 1 -- coding_type_sjis
2701 2 -- coding_type_iso2022
2702 3 -- coding_type_big5
2703 4 -- coding_type_ccl encoder/decoder written in CCL
2704 nil -- coding_type_no_conversion
2705 t -- coding_type_undecided (automatic conversion on decoding,
2706 no-conversion on encoding)
2708 `element[4]' contains information to be set in `coding->flags' and
2709 `coding->spec'. The meaning varies by `coding->type'.
2711 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2712 of length 32 (of which the first 17 sub-elements are used now).
2713 Meanings of these sub-elements are:
2715 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2716 If the value is an integer of valid charset, the charset is
2717 assumed to be designated to graphic register N initially.
2719 If the value is minus, it is a minus value of charset which
2720 reserves graphic register N, which means that the charset is
2721 not designated initially but should be designated to graphic
2722 register N just before encoding a character in that charset.
2724 If the value is nil, graphic register N is never used on
2727 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2728 Each value takes t or nil. See the section ISO2022 of
2729 `coding.h' for more information.
2731 If `coding->type' is `coding_type_big5', element[4] is t to denote
2732 BIG5-ETen or nil to denote BIG5-HKU.
2734 If `coding->type' takes the other value, element[4] is ignored.
2736 Emacs Lisp's coding system also carries information about format of
2737 end-of-line in a value of property `eol-type'. If the value is
2738 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2739 means CODING_EOL_CR. If it is not integer, it should be a vector
2740 of subsidiary coding systems of which property `eol-type' has one
2745 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2746 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2747 is setup so that no conversion is necessary and return -1, else
2751 setup_coding_system (coding_system
, coding
)
2752 Lisp_Object coding_system
;
2753 struct coding_system
*coding
;
2755 Lisp_Object coding_spec
, coding_type
, eol_type
, plist
;
2759 /* Initialize some fields required for all kinds of coding systems. */
2760 coding
->symbol
= coding_system
;
2761 coding
->common_flags
= 0;
2763 coding
->heading_ascii
= -1;
2764 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2765 coding_spec
= Fget (coding_system
, Qcoding_system
);
2766 if (!VECTORP (coding_spec
)
2767 || XVECTOR (coding_spec
)->size
!= 5
2768 || !CONSP (XVECTOR (coding_spec
)->contents
[3]))
2769 goto label_invalid_coding_system
;
2771 eol_type
= inhibit_eol_conversion
? Qnil
: Fget (coding_system
, Qeol_type
);
2772 if (VECTORP (eol_type
))
2774 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2775 coding
->common_flags
= CODING_REQUIRE_DETECTION_MASK
;
2777 else if (XFASTINT (eol_type
) == 1)
2779 coding
->eol_type
= CODING_EOL_CRLF
;
2780 coding
->common_flags
2781 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2783 else if (XFASTINT (eol_type
) == 2)
2785 coding
->eol_type
= CODING_EOL_CR
;
2786 coding
->common_flags
2787 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2790 coding
->eol_type
= CODING_EOL_LF
;
2792 coding_type
= XVECTOR (coding_spec
)->contents
[0];
2793 /* Try short cut. */
2794 if (SYMBOLP (coding_type
))
2796 if (EQ (coding_type
, Qt
))
2798 coding
->type
= coding_type_undecided
;
2799 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
2802 coding
->type
= coding_type_no_conversion
;
2806 /* Initialize remaining fields. */
2807 coding
->composing
= 0;
2808 coding
->character_unification_table_for_decode
= Qnil
;
2809 coding
->character_unification_table_for_encode
= Qnil
;
2811 /* Get values of coding system properties:
2812 `post-read-conversion', `pre-write-conversion',
2813 `character-unification-table-for-decode',
2814 `character-unification-table-for-encode'. */
2815 plist
= XVECTOR (coding_spec
)->contents
[3];
2816 coding
->post_read_conversion
= Fplist_get (plist
, Qpost_read_conversion
);
2817 coding
->pre_write_conversion
= Fplist_get (plist
, Qpre_write_conversion
);
2818 val
= Fplist_get (plist
, Qcharacter_unification_table_for_decode
);
2820 val
= Fget (val
, Qcharacter_unification_table_for_decode
);
2821 coding
->character_unification_table_for_decode
2822 = CHAR_TABLE_P (val
) ? val
: Qnil
;
2823 val
= Fplist_get (plist
, Qcharacter_unification_table_for_encode
);
2825 val
= Fget (val
, Qcharacter_unification_table_for_encode
);
2826 coding
->character_unification_table_for_encode
2827 = CHAR_TABLE_P (val
) ? val
: Qnil
;
2828 val
= Fplist_get (plist
, Qcoding_category
);
2831 val
= Fget (val
, Qcoding_category_index
);
2833 coding
->category_idx
= XINT (val
);
2835 goto label_invalid_coding_system
;
2838 goto label_invalid_coding_system
;
2840 val
= Fplist_get (plist
, Qsafe_charsets
);
2843 for (i
= 0; i
<= MAX_CHARSET
; i
++)
2844 coding
->safe_charsets
[i
] = 1;
2848 bzero (coding
->safe_charsets
, MAX_CHARSET
+ 1);
2851 if ((i
= get_charset_id (XCONS (val
)->car
)) >= 0)
2852 coding
->safe_charsets
[i
] = 1;
2853 val
= XCONS (val
)->cdr
;
2857 switch (XFASTINT (coding_type
))
2860 coding
->type
= coding_type_emacs_mule
;
2861 if (!NILP (coding
->post_read_conversion
))
2862 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
2863 if (!NILP (coding
->pre_write_conversion
))
2864 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
2868 coding
->type
= coding_type_sjis
;
2869 coding
->common_flags
2870 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2874 coding
->type
= coding_type_iso2022
;
2875 coding
->common_flags
2876 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2878 Lisp_Object val
, temp
;
2880 int i
, charset
, reg_bits
= 0;
2882 val
= XVECTOR (coding_spec
)->contents
[4];
2884 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2885 goto label_invalid_coding_system
;
2887 flags
= XVECTOR (val
)->contents
;
2889 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2890 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2891 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2892 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2893 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2894 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2895 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2896 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2897 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2898 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2899 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2900 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
2901 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
2904 /* Invoke graphic register 0 to plane 0. */
2905 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2906 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2907 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2908 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2909 /* Not single shifting at first. */
2910 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
2911 /* Beginning of buffer should also be regarded as bol. */
2912 CODING_SPEC_ISO_BOL (coding
) = 1;
2914 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2915 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = 255;
2916 val
= Vcharset_revision_alist
;
2919 charset
= get_charset_id (Fcar_safe (XCONS (val
)->car
));
2921 && (temp
= Fcdr_safe (XCONS (val
)->car
), INTEGERP (temp
))
2922 && (i
= XINT (temp
), (i
>= 0 && (i
+ '@') < 128)))
2923 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = i
;
2924 val
= XCONS (val
)->cdr
;
2927 /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
2928 FLAGS[REG] can be one of below:
2929 integer CHARSET: CHARSET occupies register I,
2930 t: designate nothing to REG initially, but can be used
2932 list of integer, nil, or t: designate the first
2933 element (if integer) to REG initially, the remaining
2934 elements (if integer) are designated to REG on request,
2935 if an element is t, REG can be used by any charsets,
2936 nil: REG is never used. */
2937 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2938 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2939 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
2940 for (i
= 0; i
< 4; i
++)
2942 if (INTEGERP (flags
[i
])
2943 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
2944 || (charset
= get_charset_id (flags
[i
])) >= 0)
2946 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2947 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
2949 else if (EQ (flags
[i
], Qt
))
2951 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2953 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
2955 else if (CONSP (flags
[i
]))
2957 Lisp_Object tail
= flags
[i
];
2959 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
2960 if (INTEGERP (XCONS (tail
)->car
)
2961 && (charset
= XINT (XCONS (tail
)->car
),
2962 CHARSET_VALID_P (charset
))
2963 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2965 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2966 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
2969 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2970 tail
= XCONS (tail
)->cdr
;
2971 while (CONSP (tail
))
2973 if (INTEGERP (XCONS (tail
)->car
)
2974 && (charset
= XINT (XCONS (tail
)->car
),
2975 CHARSET_VALID_P (charset
))
2976 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2977 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2979 else if (EQ (XCONS (tail
)->car
, Qt
))
2981 tail
= XCONS (tail
)->cdr
;
2985 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2987 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
2988 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
2991 if (reg_bits
&& ! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
2993 /* REG 1 can be used only by locking shift in 7-bit env. */
2994 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2996 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
2997 /* Without any shifting, only REG 0 and 1 can be used. */
3002 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3004 if (CHARSET_VALID_P (charset
))
3006 /* There exist some default graphic registers to be
3009 /* We had better avoid designating a charset of
3010 CHARS96 to REG 0 as far as possible. */
3011 if (CHARSET_CHARS (charset
) == 96)
3012 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3014 ? 1 : (reg_bits
& 4 ? 2 : (reg_bits
& 8 ? 3 : 0)));
3016 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3018 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
3022 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3023 coding
->spec
.iso2022
.last_invalid_designation_register
= -1;
3027 coding
->type
= coding_type_big5
;
3028 coding
->common_flags
3029 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3031 = (NILP (XVECTOR (coding_spec
)->contents
[4])
3032 ? CODING_FLAG_BIG5_HKU
3033 : CODING_FLAG_BIG5_ETEN
);
3037 coding
->type
= coding_type_ccl
;
3038 coding
->common_flags
3039 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3041 Lisp_Object val
= XVECTOR (coding_spec
)->contents
[4];
3043 && VECTORP (XCONS (val
)->car
)
3044 && VECTORP (XCONS (val
)->cdr
))
3046 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), XCONS (val
)->car
);
3047 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), XCONS (val
)->cdr
);
3050 goto label_invalid_coding_system
;
3052 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3056 coding
->type
= coding_type_raw_text
;
3060 goto label_invalid_coding_system
;
3064 label_invalid_coding_system
:
3065 coding
->type
= coding_type_no_conversion
;
3066 coding
->category_idx
= CODING_CATEGORY_IDX_BINARY
;
3067 coding
->common_flags
= 0;
3068 coding
->eol_type
= CODING_EOL_LF
;
3069 coding
->pre_write_conversion
= coding
->post_read_conversion
= Qnil
;
3073 /* Emacs has a mechanism to automatically detect a coding system if it
3074 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3075 it's impossible to distinguish some coding systems accurately
3076 because they use the same range of codes. So, at first, coding
3077 systems are grouped into the following categories:
3079 o coding-category-emacs-mule
3081 The category for a coding system which has the same code range
3082 as Emacs' internal format. Assigned the coding-system (Lisp
3083 symbol) `emacs-mule' by default.
3085 o coding-category-sjis
3087 The category for a coding system which has the same code range
3088 as SJIS. Assigned the coding-system (Lisp
3089 symbol) `japanese-shift-jis' by default.
3091 o coding-category-iso-7
3093 The category for a coding system which has the same code range
3094 as ISO2022 of 7-bit environment. This doesn't use any locking
3095 shift and single shift functions. This can encode/decode all
3096 charsets. Assigned the coding-system (Lisp symbol)
3097 `iso-2022-7bit' by default.
3099 o coding-category-iso-7-tight
3101 Same as coding-category-iso-7 except that this can
3102 encode/decode only the specified charsets.
3104 o coding-category-iso-8-1
3106 The category for a coding system which has the same code range
3107 as ISO2022 of 8-bit environment and graphic plane 1 used only
3108 for DIMENSION1 charset. This doesn't use any locking shift
3109 and single shift functions. Assigned the coding-system (Lisp
3110 symbol) `iso-latin-1' by default.
3112 o coding-category-iso-8-2
3114 The category for a coding system which has the same code range
3115 as ISO2022 of 8-bit environment and graphic plane 1 used only
3116 for DIMENSION2 charset. This doesn't use any locking shift
3117 and single shift functions. Assigned the coding-system (Lisp
3118 symbol) `japanese-iso-8bit' by default.
3120 o coding-category-iso-7-else
3122 The category for a coding system which has the same code range
3123 as ISO2022 of 7-bit environment but uses locking shift or
3124 single shift functions. Assigned the coding-system (Lisp
3125 symbol) `iso-2022-7bit-lock' by default.
3127 o coding-category-iso-8-else
3129 The category for a coding system which has the same code range
3130 as ISO2022 of 8-bit environment but uses locking shift or
3131 single shift functions. Assigned the coding-system (Lisp
3132 symbol) `iso-2022-8bit-ss2' by default.
3134 o coding-category-big5
3136 The category for a coding system which has the same code range
3137 as BIG5. Assigned the coding-system (Lisp symbol)
3138 `cn-big5' by default.
3140 o coding-category-binary
3142 The category for a coding system not categorized in any of the
3143 above. Assigned the coding-system (Lisp symbol)
3144 `no-conversion' by default.
3146 Each of them is a Lisp symbol and the value is an actual
3147 `coding-system's (this is also a Lisp symbol) assigned by a user.
3148 What Emacs does actually is to detect a category of coding system.
3149 Then, it uses a `coding-system' assigned to it. If Emacs can't
3150 decide only one possible category, it selects a category of the
3151 highest priority. Priorities of categories are also specified by a
3152 user in a Lisp variable `coding-category-list'.
3156 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3157 If it detects possible coding systems, return an integer in which
3158 appropriate flag bits are set. Flag bits are defined by macros
3159 CODING_CATEGORY_MASK_XXX in `coding.h'.
3161 The number of ASCII characters at the head is returned in *SKIP. */
3164 detect_coding_mask (source
, src_bytes
, priorities
, skip
)
3165 unsigned char *source
;
3166 int src_bytes
, *priorities
, *skip
;
3168 register unsigned char c
;
3169 unsigned char *src
= source
, *src_end
= source
+ src_bytes
;
3170 unsigned int mask
= (CODING_CATEGORY_MASK_ISO_7BIT
3171 | CODING_CATEGORY_MASK_ISO_SHIFT
);
3174 /* At first, skip all ASCII characters and control characters except
3175 for three ISO2022 specific control characters. */
3176 label_loop_detect_coding
:
3177 while (src
< src_end
)
3181 || ((mask
& CODING_CATEGORY_MASK_ISO_7BIT
)
3182 && c
== ISO_CODE_ESC
)
3183 || ((mask
& CODING_CATEGORY_MASK_ISO_SHIFT
)
3184 && (c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)))
3188 *skip
= src
- source
;
3191 /* We found nothing other than ASCII. There's nothing to do. */
3194 /* The text seems to be encoded in some multilingual coding system.
3195 Now, try to find in which coding system the text is encoded. */
3198 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3199 /* C is an ISO2022 specific control code of C0. */
3200 mask
= detect_coding_iso2022 (src
, src_end
);
3203 /* No valid ISO2022 code follows C. Try again. */
3205 mask
= (c
!= ISO_CODE_ESC
3206 ? CODING_CATEGORY_MASK_ISO_7BIT
3207 : CODING_CATEGORY_MASK_ISO_SHIFT
);
3208 goto label_loop_detect_coding
;
3211 goto label_return_highest_only
;
3219 /* C is the first byte of SJIS character code,
3220 or a leading-code of Emacs' internal format (emacs-mule). */
3221 try = CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE
;
3223 /* Or, if C is a special latin extra code,
3224 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3225 or is an ISO2022 control-sequence-introducer (CSI),
3226 we should also consider the possibility of ISO2022 codings. */
3227 if ((VECTORP (Vlatin_extra_code_table
)
3228 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3229 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
3230 || (c
== ISO_CODE_CSI
3233 || ((*src
== '0' || *src
== '1' || *src
== '2')
3234 && src
+ 1 < src_end
3235 && src
[1] == ']')))))
3236 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3237 | CODING_CATEGORY_MASK_ISO_8BIT
);
3240 /* C is a character of ISO2022 in graphic plane right,
3241 or a SJIS's 1-byte character code (i.e. JISX0201),
3242 or the first byte of BIG5's 2-byte code. */
3243 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3244 | CODING_CATEGORY_MASK_ISO_8BIT
3245 | CODING_CATEGORY_MASK_SJIS
3246 | CODING_CATEGORY_MASK_BIG5
);
3251 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3253 priorities
[i
] &= try;
3254 if (priorities
[i
] & CODING_CATEGORY_MASK_ISO
)
3255 mask
= detect_coding_iso2022 (src
, src_end
);
3256 else if (priorities
[i
] & CODING_CATEGORY_MASK_SJIS
)
3257 mask
= detect_coding_sjis (src
, src_end
);
3258 else if (priorities
[i
] & CODING_CATEGORY_MASK_BIG5
)
3259 mask
= detect_coding_big5 (src
, src_end
);
3260 else if (priorities
[i
] & CODING_CATEGORY_MASK_EMACS_MULE
)
3261 mask
= detect_coding_emacs_mule (src
, src_end
);
3263 goto label_return_highest_only
;
3265 return CODING_CATEGORY_MASK_RAW_TEXT
;
3267 if (try & CODING_CATEGORY_MASK_ISO
)
3268 mask
|= detect_coding_iso2022 (src
, src_end
);
3269 if (try & CODING_CATEGORY_MASK_SJIS
)
3270 mask
|= detect_coding_sjis (src
, src_end
);
3271 if (try & CODING_CATEGORY_MASK_BIG5
)
3272 mask
|= detect_coding_big5 (src
, src_end
);
3273 if (try & CODING_CATEGORY_MASK_EMACS_MULE
)
3274 mask
|= detect_coding_emacs_mule (src
, src_end
);
3276 return (mask
| CODING_CATEGORY_MASK_RAW_TEXT
);
3278 label_return_highest_only
:
3279 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3281 if (mask
& priorities
[i
])
3282 return priorities
[i
];
3284 return CODING_CATEGORY_MASK_RAW_TEXT
;
3287 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3288 The information of the detected coding system is set in CODING. */
3291 detect_coding (coding
, src
, src_bytes
)
3292 struct coding_system
*coding
;
3298 int priorities
[CODING_CATEGORY_IDX_MAX
];
3299 Lisp_Object val
= Vcoding_category_list
;
3302 while (CONSP (val
) && i
< CODING_CATEGORY_IDX_MAX
)
3304 if (! SYMBOLP (XCONS (val
)->car
))
3306 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
3307 if (idx
>= CODING_CATEGORY_IDX_MAX
)
3309 priorities
[i
++] = (1 << idx
);
3310 val
= XCONS (val
)->cdr
;
3312 /* If coding-category-list is valid and contains all coding
3313 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
3314 the following code saves Emacs from crashing. */
3315 while (i
< CODING_CATEGORY_IDX_MAX
)
3316 priorities
[i
++] = CODING_CATEGORY_MASK_RAW_TEXT
;
3318 mask
= detect_coding_mask (src
, src_bytes
, priorities
, &skip
);
3319 coding
->heading_ascii
= skip
;
3323 /* We found a single coding system of the highest priority in MASK. */
3325 while (mask
&& ! (mask
& 1)) mask
>>= 1, idx
++;
3327 idx
= CODING_CATEGORY_IDX_RAW_TEXT
;
3329 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[idx
])->value
;
3331 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3333 Lisp_Object tmp
= Fget (val
, Qeol_type
);
3336 val
= XVECTOR (tmp
)->contents
[coding
->eol_type
];
3338 setup_coding_system (val
, coding
);
3339 /* Set this again because setup_coding_system reset this member. */
3340 coding
->heading_ascii
= skip
;
3343 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3344 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3345 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3347 How many non-eol characters are at the head is returned as *SKIP. */
3349 #define MAX_EOL_CHECK_COUNT 3
3352 detect_eol_type (source
, src_bytes
, skip
)
3353 unsigned char *source
;
3354 int src_bytes
, *skip
;
3356 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3358 int total
= 0; /* How many end-of-lines are found so far. */
3359 int eol_type
= CODING_EOL_UNDECIDED
;
3364 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3367 if (c
== '\n' || c
== '\r')
3370 *skip
= src
- 1 - source
;
3373 this_eol_type
= CODING_EOL_LF
;
3374 else if (src
>= src_end
|| *src
!= '\n')
3375 this_eol_type
= CODING_EOL_CR
;
3377 this_eol_type
= CODING_EOL_CRLF
, src
++;
3379 if (eol_type
== CODING_EOL_UNDECIDED
)
3380 /* This is the first end-of-line. */
3381 eol_type
= this_eol_type
;
3382 else if (eol_type
!= this_eol_type
)
3384 /* The found type is different from what found before. */
3385 eol_type
= CODING_EOL_INCONSISTENT
;
3392 *skip
= src_end
- source
;
3396 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3397 is encoded. If it detects an appropriate format of end-of-line, it
3398 sets the information in *CODING. */
3401 detect_eol (coding
, src
, src_bytes
)
3402 struct coding_system
*coding
;
3408 int eol_type
= detect_eol_type (src
, src_bytes
, &skip
);
3410 if (coding
->heading_ascii
> skip
)
3411 coding
->heading_ascii
= skip
;
3413 skip
= coding
->heading_ascii
;
3415 if (eol_type
== CODING_EOL_UNDECIDED
)
3417 if (eol_type
== CODING_EOL_INCONSISTENT
)
3420 /* This code is suppressed until we find a better way to
3421 distinguish raw text file and binary file. */
3423 /* If we have already detected that the coding is raw-text, the
3424 coding should actually be no-conversion. */
3425 if (coding
->type
== coding_type_raw_text
)
3427 setup_coding_system (Qno_conversion
, coding
);
3430 /* Else, let's decode only text code anyway. */
3432 eol_type
= CODING_EOL_LF
;
3435 val
= Fget (coding
->symbol
, Qeol_type
);
3436 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
3438 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
3439 coding
->heading_ascii
= skip
;
3443 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3445 #define DECODING_BUFFER_MAG(coding) \
3446 (coding->type == coding_type_iso2022 \
3448 : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3450 : (coding->type == coding_type_raw_text \
3452 : (coding->type == coding_type_ccl \
3453 ? coding->spec.ccl.decoder.buf_magnification \
3456 /* Return maximum size (bytes) of a buffer enough for decoding
3457 SRC_BYTES of text encoded in CODING. */
3460 decoding_buffer_size (coding
, src_bytes
)
3461 struct coding_system
*coding
;
3464 return (src_bytes
* DECODING_BUFFER_MAG (coding
)
3465 + CONVERSION_BUFFER_EXTRA_ROOM
);
3468 /* Return maximum size (bytes) of a buffer enough for encoding
3469 SRC_BYTES of text to CODING. */
3472 encoding_buffer_size (coding
, src_bytes
)
3473 struct coding_system
*coding
;
3478 if (coding
->type
== coding_type_ccl
)
3479 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3483 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is big
   enough; otherwise it is replaced by a larger one whose size is the
   current size doubled until it covers SIZE.  The old contents are
   not preserved, and a pointer from an earlier call becomes invalid
   once the buffer has been replaced.

   This function has no failure path of its own: it returns whatever
   xmalloc returned.  NOTE(review): xmalloc presumably does not return
   on memory exhaustion -- confirm before relying on a NULL check.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Start from the current size, or from the minimum when the
         buffer has not been set up yet; doubling zero would never
         reach SIZE.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size)
        {
          /* Double in unsigned arithmetic so that overflow can be
             detected instead of being undefined.  */
          unsigned int doubled = (unsigned int) real_size * 2u;

          if (doubled > (unsigned int) -1 / 2)
            {
              /* Doubling no longer fits in an int; settle for exactly
                 SIZE bytes.  */
              real_size = size;
              break;
            }
          real_size = (int) doubled;
        }

      /* Allocate first and free afterwards, so the old buffer is still
         in place if xmalloc does not return.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3516 ccl_coding_driver (coding
, source
, destination
, src_bytes
, dst_bytes
, encodep
)
3517 struct coding_system
*coding
;
3518 unsigned char *source
, *destination
;
3519 int src_bytes
, dst_bytes
, encodep
;
3521 struct ccl_program
*ccl
3522 = encodep
? &coding
->spec
.ccl
.encoder
: &coding
->spec
.ccl
.decoder
;
3525 coding
->produced
= ccl_driver (ccl
, source
, destination
,
3526 src_bytes
, dst_bytes
, &(coding
->consumed
));
3529 coding
->produced_char
= coding
->produced
;
3530 coding
->consumed_char
3531 = multibyte_chars_in_text (source
, coding
->consumed
);
3535 coding
->produced_char
3536 = multibyte_chars_in_text (destination
, coding
->produced
);
3537 coding
->consumed_char
= coding
->consumed
;
3539 switch (ccl
->status
)
3541 case CCL_STAT_SUSPEND_BY_SRC
:
3542 result
= CODING_FINISH_INSUFFICIENT_SRC
;
3544 case CCL_STAT_SUSPEND_BY_DST
:
3545 result
= CODING_FINISH_INSUFFICIENT_DST
;
3548 result
= CODING_FINISH_NORMAL
;
3554 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3555 decoding, it may detect coding system and format of end-of-line if
3556 those are not yet decided. */
3559 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3560 struct coding_system
*coding
;
3561 unsigned char *source
, *destination
;
3562 int src_bytes
, dst_bytes
;
3568 coding
->produced
= coding
->produced_char
= 0;
3569 coding
->consumed
= coding
->consumed_char
= 0;
3570 coding
->fake_multibyte
= 0;
3571 return CODING_FINISH_NORMAL
;
3574 if (coding
->type
== coding_type_undecided
)
3575 detect_coding (coding
, source
, src_bytes
);
3577 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3578 detect_eol (coding
, source
, src_bytes
);
3580 switch (coding
->type
)
3582 case coding_type_emacs_mule
:
3583 case coding_type_undecided
:
3584 case coding_type_raw_text
:
3585 if (coding
->eol_type
== CODING_EOL_LF
3586 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3587 goto label_no_conversion
;
3588 result
= decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3591 case coding_type_sjis
:
3592 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3593 src_bytes
, dst_bytes
, 1);
3596 case coding_type_iso2022
:
3597 result
= decode_coding_iso2022 (coding
, source
, destination
,
3598 src_bytes
, dst_bytes
);
3601 case coding_type_big5
:
3602 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3603 src_bytes
, dst_bytes
, 0);
3606 case coding_type_ccl
:
3607 result
= ccl_coding_driver (coding
, source
, destination
,
3608 src_bytes
, dst_bytes
, 0);
3611 default: /* i.e. case coding_type_no_conversion: */
3612 label_no_conversion
:
3613 if (dst_bytes
&& src_bytes
> dst_bytes
)
3615 coding
->produced
= dst_bytes
;
3616 result
= CODING_FINISH_INSUFFICIENT_DST
;
3620 coding
->produced
= src_bytes
;
3621 result
= CODING_FINISH_NORMAL
;
3624 bcopy (source
, destination
, coding
->produced
);
3626 safe_bcopy (source
, destination
, coding
->produced
);
3627 coding
->fake_multibyte
= 1;
3629 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3636 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3639 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3640 struct coding_system
*coding
;
3641 unsigned char *source
, *destination
;
3642 int src_bytes
, dst_bytes
;
3648 coding
->produced
= coding
->produced_char
= 0;
3649 coding
->consumed
= coding
->consumed_char
= 0;
3650 coding
->fake_multibyte
= 0;
3651 return CODING_FINISH_NORMAL
;
3654 switch (coding
->type
)
3656 case coding_type_emacs_mule
:
3657 case coding_type_undecided
:
3658 case coding_type_raw_text
:
3659 if (coding
->eol_type
== CODING_EOL_LF
3660 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3661 goto label_no_conversion
;
3662 result
= encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3665 case coding_type_sjis
:
3666 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3667 src_bytes
, dst_bytes
, 1);
3670 case coding_type_iso2022
:
3671 result
= encode_coding_iso2022 (coding
, source
, destination
,
3672 src_bytes
, dst_bytes
);
3675 case coding_type_big5
:
3676 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3677 src_bytes
, dst_bytes
, 0);
3680 case coding_type_ccl
:
3681 result
= ccl_coding_driver (coding
, source
, destination
,
3682 src_bytes
, dst_bytes
, 1);
3685 default: /* i.e. case coding_type_no_conversion: */
3686 label_no_conversion
:
3687 if (dst_bytes
&& src_bytes
> dst_bytes
)
3689 coding
->produced
= dst_bytes
;
3690 result
= CODING_FINISH_INSUFFICIENT_DST
;
3694 coding
->produced
= src_bytes
;
3695 result
= CODING_FINISH_NORMAL
;
3698 bcopy (source
, destination
, coding
->produced
);
3700 safe_bcopy (source
, destination
, coding
->produced
);
3701 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
3703 unsigned char *p
= destination
, *pend
= p
+ coding
->produced
;
3705 if (*p
++ == '\015') p
[-1] = '\n';
3707 coding
->fake_multibyte
= 1;
3709 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3716 /* Scan text in the region between *BEG and *END (byte positions),
3717 skip characters which we don't have to decode by coding system
3718 CODING at the head and tail, then set *BEG and *END to the region
3719 of the text we actually have to convert. The caller should move
3720 the gap out of the region in advance.
3722 If STR is not NULL, *BEG and *END are indices into STR. */
3725 shrink_decoding_region (beg
, end
, coding
, str
)
3727 struct coding_system
*coding
;
3730 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
, c
;
3733 if (coding
->type
== coding_type_ccl
3734 || coding
->type
== coding_type_undecided
3735 || !NILP (coding
->post_read_conversion
))
3737 /* We can't skip any data. */
3740 else if (coding
->type
== coding_type_no_conversion
)
3742 /* We need no conversion, but don't have to skip any data here.
3743 Decoding routine handles them effectively anyway. */
3747 if (coding
->heading_ascii
>= 0)
3748 /* Detection routine has already found how much we can skip at the
3750 *beg
+= coding
->heading_ascii
;
3754 begp_orig
= begp
= str
+ *beg
;
3755 endp_orig
= endp
= str
+ *end
;
3759 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
3760 endp_orig
= endp
= begp
+ *end
- *beg
;
3763 eol_conversion
= (coding
->eol_type
!= CODING_EOL_LF
);
3765 switch (coding
->type
)
3767 case coding_type_emacs_mule
:
3768 case coding_type_raw_text
:
3771 if (coding
->heading_ascii
< 0)
3772 while (begp
< endp
&& *begp
!= '\r' && *begp
< 0x80) begp
++;
3773 while (begp
< endp
&& *(endp
- 1) != '\r' && *(endp
- 1) < 0x80)
3780 case coding_type_sjis
:
3781 case coding_type_big5
:
3782 /* We can skip all ASCII characters at the head. */
3783 if (coding
->heading_ascii
< 0)
3786 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\r') begp
++;
3788 while (begp
< endp
&& *begp
< 0x80) begp
++;
3790 /* We can skip all ASCII characters at the tail except for the
3791 second byte of SJIS or BIG5 code. */
3793 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\r') endp
--;
3795 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3796 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] >= 0x80)
3800 default: /* i.e. case coding_type_iso2022: */
3801 if (coding
->heading_ascii
< 0)
3803 /* We can skip all ASCII characters at the head except for a
3804 few control codes. */
3805 while (begp
< endp
&& (c
= *begp
) < 0x80
3806 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3807 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
3808 && (!eol_conversion
|| c
!= ISO_CODE_LF
))
3811 switch (coding
->category_idx
)
3813 case CODING_CATEGORY_IDX_ISO_8_1
:
3814 case CODING_CATEGORY_IDX_ISO_8_2
:
3815 /* We can skip all ASCII characters at the tail. */
3817 while (begp
< endp
&& (c
= endp
[-1]) < 0x80 && c
!= '\r') endp
--;
3819 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3822 case CODING_CATEGORY_IDX_ISO_7
:
3823 case CODING_CATEGORY_IDX_ISO_7_TIGHT
:
3824 /* We can skip all charactes at the tail except for ESC and
3825 the following 2-byte at the tail. */
3828 && (c
= endp
[-1]) < 0x80 && c
!= ISO_CODE_ESC
&& c
!= '\r')
3832 && (c
= endp
[-1]) < 0x80 && c
!= ISO_CODE_ESC
)
3834 if (begp
< endp
&& endp
[-1] == ISO_CODE_ESC
)
3836 if (endp
+ 1 < endp_orig
&& end
[0] == '(' && end
[1] == 'B')
3837 /* This is an ASCII designation sequence. We can
3838 surely skip the tail. */
3841 /* Hmmm, we can't skip the tail. */
3846 *beg
+= begp
- begp_orig
;
3847 *end
+= endp
- endp_orig
;
3851 /* Like shrink_decoding_region but for encoding. */
3854 shrink_encoding_region (beg
, end
, coding
, str
)
3856 struct coding_system
*coding
;
3859 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
;
3862 if (coding
->type
== coding_type_ccl
)
3863 /* We can't skip any data. */
3865 else if (coding
->type
== coding_type_no_conversion
)
3867 /* We need no conversion. */
3874 begp_orig
= begp
= str
+ *beg
;
3875 endp_orig
= endp
= str
+ *end
;
3879 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
3880 endp_orig
= endp
= begp
+ *end
- *beg
;
3883 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
3884 || coding
->eol_type
== CODING_EOL_CRLF
);
3886 /* Here, we don't have to check coding->pre_write_conversion because
3887 the caller is expected to have handled it already. */
3888 switch (coding
->type
)
3890 case coding_type_undecided
:
3891 case coding_type_emacs_mule
:
3892 case coding_type_raw_text
:
3895 while (begp
< endp
&& *begp
!= '\n') begp
++;
3896 while (begp
< endp
&& endp
[-1] != '\n') endp
--;
3902 case coding_type_iso2022
:
3903 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3905 unsigned char *bol
= begp
;
3906 while (begp
< endp
&& *begp
< 0x80)
3909 if (begp
[-1] == '\n')
3913 goto label_skip_tail
;
3918 /* We can skip all ASCII characters at the head and tail. */
3920 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\n') begp
++;
3922 while (begp
< endp
&& *begp
< 0x80) begp
++;
3925 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\n') endp
--;
3927 while (begp
< endp
&& *(endp
- 1) < 0x80) endp
--;
3931 *beg
+= begp
- begp_orig
;
3932 *end
+= endp
- endp_orig
;
3936 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
3937 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
3938 coding system CODING, and return the status code of code conversion
3939 (currently, this value has no meaning).
3941 How many characters (and bytes) are converted to how many
3942 characters (and bytes) are recorded in members of the structure
3945 If REPLACE is nonzero, we do various things as if the original text
3946 is deleted and a new text is inserted. See the comments in
3947 replace_range (insdel.c) to know what we are doing. */
3950 code_convert_region (from
, from_byte
, to
, to_byte
, coding
, encodep
, replace
)
3951 int from
, from_byte
, to
, to_byte
, encodep
, replace
;
3952 struct coding_system
*coding
;
3954 int len
= to
- from
, len_byte
= to_byte
- from_byte
;
3955 int require
, inserted
, inserted_byte
;
3956 int head_skip
, tail_skip
, total_skip
;
3957 Lisp_Object saved_coding_symbol
= Qnil
;
3958 int multibyte
= !NILP (current_buffer
->enable_multibyte_characters
);
3960 int fake_multibyte
= 0;
3961 unsigned char *src
, *dst
;
3962 Lisp_Object deletion
= Qnil
;
3966 int saved_from
= from
;
3968 prepare_to_modify_buffer (from
, to
, &from
);
3969 if (saved_from
!= from
)
3973 from_byte
= CHAR_TO_BYTE (from
), to_byte
= CHAR_TO_BYTE (to
);
3975 from_byte
= from
, to_byte
= to
;
3976 len_byte
= to_byte
- from_byte
;
3980 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
3982 /* We must detect encoding of text and eol format. */
3984 if (from
< GPT
&& to
> GPT
)
3985 move_gap_both (from
, from_byte
);
3986 if (coding
->type
== coding_type_undecided
)
3988 detect_coding (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
3989 if (coding
->type
== coding_type_undecided
)
3990 /* It seems that the text contains only ASCII, but we
3991 should not left it undecided because the deeper
3992 decoding routine (decode_coding) tries to detect the
3993 encodings again in vain. */
3994 coding
->type
= coding_type_emacs_mule
;
3996 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3998 saved_coding_symbol
= coding
->symbol
;
3999 detect_eol (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4000 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4001 coding
->eol_type
= CODING_EOL_LF
;
4002 /* We had better recover the original eol format if we
4003 encounter an inconsistent eol format while decoding. */
4004 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4008 coding
->consumed_char
= len
, coding
->consumed
= len_byte
;
4011 ? ! CODING_REQUIRE_ENCODING (coding
)
4012 : ! CODING_REQUIRE_DECODING (coding
))
4014 coding
->produced
= len_byte
;
4017 /* See the comment of the member heading_ascii in coding.h. */
4018 && coding
->heading_ascii
< len_byte
)
4020 /* We still may have to combine byte at the head and the
4021 tail of the text in the region. */
4022 if (from
< GPT
&& GPT
< to
)
4023 move_gap_both (to
, to_byte
);
4024 len
= multibyte_chars_in_text (BYTE_POS_ADDR (from_byte
), len_byte
);
4025 adjust_after_insert (from
, from_byte
, to
, to_byte
, len
);
4026 coding
->produced_char
= len
;
4029 coding
->produced_char
= len_byte
;
4033 /* Now we convert the text. */
4035 /* For encoding, we must process pre-write-conversion in advance. */
4037 && ! NILP (coding
->pre_write_conversion
)
4038 && SYMBOLP (coding
->pre_write_conversion
)
4039 && ! NILP (Ffboundp (coding
->pre_write_conversion
)))
4041 /* The function in pre-write-conversion may put a new text in a
4043 struct buffer
*prev
= current_buffer
, *new;
4045 call2 (coding
->pre_write_conversion
, from
, to
);
4046 if (current_buffer
!= prev
)
4049 new = current_buffer
;
4050 set_buffer_internal_1 (prev
);
4051 del_range_2 (from
, from_byte
, to
, to_byte
);
4052 insert_from_buffer (new, BEG
, len
, 0);
4054 to_byte
= multibyte
? CHAR_TO_BYTE (to
) : to
;
4055 len_byte
= to_byte
- from_byte
;
4060 deletion
= make_buffer_string_both (from
, from_byte
, to
, to_byte
, 1);
4062 /* Try to skip the heading and tailing ASCIIs. */
4064 int from_byte_orig
= from_byte
, to_byte_orig
= to_byte
;
4066 if (from
< GPT
&& GPT
< to
)
4067 move_gap_both (from
, from_byte
);
4069 shrink_encoding_region (&from_byte
, &to_byte
, coding
, NULL
);
4071 shrink_decoding_region (&from_byte
, &to_byte
, coding
, NULL
);
4072 if (from_byte
== to_byte
)
4074 coding
->produced
= len_byte
;
4075 coding
->produced_char
= multibyte
? len
: len_byte
;
4077 /* We must record and adjust for this new text now. */
4078 adjust_after_insert (from
, from_byte_orig
, to
, to_byte_orig
, len
);
4082 head_skip
= from_byte
- from_byte_orig
;
4083 tail_skip
= to_byte_orig
- to_byte
;
4084 total_skip
= head_skip
+ tail_skip
;
4087 len
-= total_skip
; len_byte
-= total_skip
;
4090 /* For conversion, we must put the gap before the text in addition to
4091 making the gap larger for efficient decoding. The required gap
4092 size starts from 2000 which is the magic number used in make_gap.
4093 But, after one batch of conversion, it will be incremented if we
4094 find that it is not enough . */
4097 if (GAP_SIZE
< require
)
4098 make_gap (require
- GAP_SIZE
);
4099 move_gap_both (from
, from_byte
);
4101 if (GPT
- BEG
< beg_unchanged
)
4102 beg_unchanged
= GPT
- BEG
;
4103 if (Z
- GPT
< end_unchanged
)
4104 end_unchanged
= Z
- GPT
;
4106 inserted
= inserted_byte
= 0;
4107 src
= GAP_END_ADDR
, dst
= GPT_ADDR
;
4109 GAP_SIZE
+= len_byte
;
4112 ZV_BYTE
-= len_byte
;
4119 /* The buffer memory is changed from:
4120 +--------+converted-text+---------+-------original-text------+---+
4121 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4122 |<------------------- GAP_SIZE -------------------->| */
4124 result
= encode_coding (coding
, src
, dst
, len_byte
, 0);
4126 result
= decode_coding (coding
, src
, dst
, len_byte
, 0);
4128 +--------+-------converted-text--------+--+---original-text--+---+
4129 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4130 |<------------------- GAP_SIZE -------------------->| */
4131 if (coding
->fake_multibyte
)
4134 if (!encodep
&& !multibyte
)
4135 coding
->produced_char
= coding
->produced
;
4136 inserted
+= coding
->produced_char
;
4137 inserted_byte
+= coding
->produced
;
4138 len_byte
-= coding
->consumed
;
4139 src
+= coding
->consumed
;
4140 dst
+= inserted_byte
;
4142 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4144 unsigned char *pend
= dst
, *p
= pend
- inserted_byte
;
4146 /* Encode LFs back to the original eol format (CR or CRLF). */
4147 if (coding
->eol_type
== CODING_EOL_CR
)
4149 while (p
< pend
) if (*p
++ == '\n') p
[-1] = '\r';
4155 while (p
< pend
) if (*p
++ == '\n') count
++;
4156 if (src
- dst
< count
)
4158 /* We don't have sufficient room for putting LFs
4159 back to CRLF. We must record converted and
4160 not-yet-converted text back to the buffer
4161 content, enlarge the gap, then record them out of
4162 the buffer contents again. */
4163 int add
= len_byte
+ inserted_byte
;
4166 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4167 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4168 make_gap (count
- GAP_SIZE
);
4170 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4171 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4172 /* Don't forget to update SRC, DST, and PEND. */
4173 src
= GAP_END_ADDR
- len_byte
;
4174 dst
= GPT_ADDR
+ inserted_byte
;
4178 inserted_byte
+= count
;
4179 coding
->produced
+= count
;
4180 p
= dst
= pend
+ count
;
4184 if (*p
== '\n') count
--, *--p
= '\r';
4188 /* Suppress eol-format conversion in the further conversion. */
4189 coding
->eol_type
= CODING_EOL_LF
;
4191 /* Restore the original symbol. */
4192 coding
->symbol
= saved_coding_symbol
;
4198 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
4200 /* The source text ends in invalid codes. Let's just
4201 make them valid buffer contents, and finish conversion. */
4202 inserted
+= len_byte
;
4203 inserted_byte
+= len_byte
;
4211 /* We have just done the first batch of conversion which was
4212 stopped because of insufficient gap. Let's reconsider the
4213 required gap size (i.e. SRC - DST) now.
4215 We have converted ORIG bytes (== coding->consumed) into
4216 NEW bytes (coding->produced). To convert the remaining
4217 LEN bytes, we may need REQUIRE bytes of gap, where:
4218 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4219 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4220 Here, we are sure that NEW >= ORIG. */
4221 float ratio
= coding
->produced
- coding
->consumed
;
4222 ratio
/= coding
->consumed
;
4223 require
= len_byte
* ratio
;
4226 if ((src
- dst
) < (require
+ 2000))
4228 /* See the comment above the previous call of make_gap. */
4229 int add
= len_byte
+ inserted_byte
;
4232 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4233 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4234 make_gap (require
+ 2000);
4236 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4237 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4238 /* Don't forget to update SRC, DST. */
4239 src
= GAP_END_ADDR
- len_byte
;
4240 dst
= GPT_ADDR
+ inserted_byte
;
4243 if (src
- dst
> 0) *dst
= 0; /* Put an anchor. */
4247 || !encodep
&& (to
- from
) != (to_byte
- from_byte
)))
4248 inserted
= multibyte_chars_in_text (GPT_ADDR
, inserted_byte
);
4250 /* If we have shrunk the conversion area, adjust it now. */
4254 safe_bcopy (GAP_END_ADDR
, GPT_ADDR
+ inserted_byte
, tail_skip
);
4255 inserted
+= total_skip
; inserted_byte
+= total_skip
;
4256 GAP_SIZE
+= total_skip
;
4257 GPT
-= head_skip
; GPT_BYTE
-= head_skip
;
4258 ZV
-= total_skip
; ZV_BYTE
-= total_skip
;
4259 Z
-= total_skip
; Z_BYTE
-= total_skip
;
4260 from
-= head_skip
; from_byte
-= head_skip
;
4261 to
+= tail_skip
; to_byte
+= tail_skip
;
4264 adjust_after_replace (from
, from_byte
, deletion
, inserted
, inserted_byte
);
4266 if (! encodep
&& ! NILP (coding
->post_read_conversion
))
4269 int orig_inserted
= inserted
, pos
= PT
;
4272 temp_set_point_both (current_buffer
, from
, from_byte
);
4273 val
= call1 (coding
->post_read_conversion
, make_number (inserted
));
4276 CHECK_NUMBER (val
, 0);
4277 inserted
= XFASTINT (val
);
4279 if (pos
>= from
+ orig_inserted
)
4280 temp_set_point (current_buffer
, pos
+ (inserted
- orig_inserted
));
4283 signal_after_change (from
, to
- from
, inserted
);
4286 coding
->consumed
= to_byte
- from_byte
;
4287 coding
->consumed_char
= to
- from
;
4288 coding
->produced
= inserted_byte
;
4289 coding
->produced_char
= inserted
;
4296 code_convert_string (str
, coding
, encodep
, nocopy
)
4298 struct coding_system
*coding
;
4299 int encodep
, nocopy
;
4303 int from
= 0, to
= XSTRING (str
)->size
;
4304 int to_byte
= STRING_BYTES (XSTRING (str
));
4305 struct gcpro gcpro1
;
4306 Lisp_Object saved_coding_symbol
= Qnil
;
4309 if (encodep
&& !NILP (coding
->pre_write_conversion
)
4310 || !encodep
&& !NILP (coding
->post_read_conversion
))
4312 /* Since we have to call Lisp functions which assume target text
4313 is in a buffer, after setting a temporary buffer, call
4314 code_convert_region. */
4315 int count
= specpdl_ptr
- specpdl
;
4316 struct buffer
*prev
= current_buffer
;
4318 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
4319 temp_output_buffer_setup (" *code-converting-work*");
4320 set_buffer_internal (XBUFFER (Vstandard_output
));
4322 insert_from_string (str
, 0, 0, to
, to_byte
, 0);
4325 /* We must insert the contents of STR as is without
4326 unibyte<->multibyte conversion. */
4327 current_buffer
->enable_multibyte_characters
= Qnil
;
4328 insert_from_string (str
, 0, 0, to_byte
, to_byte
, 0);
4329 current_buffer
->enable_multibyte_characters
= Qt
;
4331 code_convert_region (BEGV
, BEGV_BYTE
, ZV
, ZV_BYTE
, coding
, encodep
, 1);
4333 /* We must return the buffer contents as unibyte string. */
4334 current_buffer
->enable_multibyte_characters
= Qnil
;
4335 str
= make_buffer_string (BEGV
, ZV
, 0);
4336 set_buffer_internal (prev
);
4337 return unbind_to (count
, str
);
4340 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4342 /* See the comments in code_convert_region. */
4343 if (coding
->type
== coding_type_undecided
)
4345 detect_coding (coding
, XSTRING (str
)->data
, to_byte
);
4346 if (coding
->type
== coding_type_undecided
)
4347 coding
->type
= coding_type_emacs_mule
;
4349 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4351 saved_coding_symbol
= coding
->symbol
;
4352 detect_eol (coding
, XSTRING (str
)->data
, to_byte
);
4353 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4354 coding
->eol_type
= CODING_EOL_LF
;
4355 /* We had better recover the original eol format if we
4356 encounter an inconsitent eol format while decoding. */
4357 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4362 ? ! CODING_REQUIRE_ENCODING (coding
)
4363 : ! CODING_REQUIRE_DECODING (coding
))
4367 /* Try to skip the heading and tailing ASCIIs. */
4369 shrink_encoding_region (&from
, &to_byte
, coding
, XSTRING (str
)->data
);
4371 shrink_decoding_region (&from
, &to_byte
, coding
, XSTRING (str
)->data
);
4373 if (from
== to_byte
)
4374 return (nocopy
? str
: Fcopy_sequence (str
));
4377 len
= encoding_buffer_size (coding
, to_byte
- from
);
4379 len
= decoding_buffer_size (coding
, to_byte
- from
);
4380 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4382 buf
= get_conversion_buffer (len
);
4386 bcopy (XSTRING (str
)->data
, buf
, from
);
4388 ? encode_coding (coding
, XSTRING (str
)->data
+ from
,
4389 buf
+ from
, to_byte
- from
, len
)
4390 : decode_coding (coding
, XSTRING (str
)->data
+ from
,
4391 buf
+ from
, to
- from
, len
));
4392 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4394 /* We simple try to decode the whole string again but without
4395 eol-conversion this time. */
4396 coding
->eol_type
= CODING_EOL_LF
;
4397 coding
->symbol
= saved_coding_symbol
;
4398 return code_convert_string (str
, coding
, encodep
, nocopy
);
4401 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
4402 STRING_BYTES (XSTRING (str
)) - to_byte
);
4404 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4406 str
= make_unibyte_string (buf
, len
+ coding
->produced
);
4408 str
= make_string_from_bytes (buf
, len
+ coding
->produced_char
,
4409 len
+ coding
->produced
);
4415 /*** 7. Emacs Lisp library functions ***/
4417 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
4418 "Return t if OBJECT is nil or a coding-system.\n\
4419 See the documentation of `make-coding-system' for information\n\
4420 about coding-system objects.")
4428 /* Get coding-spec vector for OBJ. */
4429 obj
= Fget (obj
, Qcoding_system
);
4430 return ((VECTORP (obj
) && XVECTOR (obj
)->size
== 5)
4434 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
4435 Sread_non_nil_coding_system
, 1, 1, 0,
4436 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4443 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4444 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
4446 while (XSTRING (val
)->size
== 0);
4447 return (Fintern (val
, Qnil
));
4450 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
4451 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4452 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4453 (prompt
, default_coding_system
)
4454 Lisp_Object prompt
, default_coding_system
;
4457 if (SYMBOLP (default_coding_system
))
4458 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
4459 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4460 Qt
, Qnil
, Qcoding_system_history
,
4461 default_coding_system
, Qnil
);
4462 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
4465 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
4467 "Check validity of CODING-SYSTEM.\n\
4468 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4469 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4470 The value of property should be a vector of length 5.")
4472 Lisp_Object coding_system
;
4474 CHECK_SYMBOL (coding_system
, 0);
4475 if (!NILP (Fcoding_system_p (coding_system
)))
4476 return coding_system
;
4478 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
4482 detect_coding_system (src
, src_bytes
, highest
)
4484 int src_bytes
, highest
;
4486 int coding_mask
, eol_type
;
4487 Lisp_Object val
, tmp
;
4490 coding_mask
= detect_coding_mask (src
, src_bytes
, NULL
, &dummy
);
4491 eol_type
= detect_eol_type (src
, src_bytes
, &dummy
);
4492 if (eol_type
== CODING_EOL_INCONSISTENT
)
4493 eol_type
== CODING_EOL_UNDECIDED
;
4498 if (eol_type
!= CODING_EOL_UNDECIDED
)
4501 val2
= Fget (Qundecided
, Qeol_type
);
4503 val
= XVECTOR (val2
)->contents
[eol_type
];
4508 /* At first, gather possible coding systems in VAL. */
4510 for (tmp
= Vcoding_category_list
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4513 = XFASTINT (Fget (XCONS (tmp
)->car
, Qcoding_category_index
));
4514 if (coding_mask
& (1 << idx
))
4516 val
= Fcons (Fsymbol_value (XCONS (tmp
)->car
), val
);
4522 val
= Fnreverse (val
);
4524 /* Then, substitute the elements by subsidiary coding systems. */
4525 for (tmp
= val
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4527 if (eol_type
!= CODING_EOL_UNDECIDED
)
4530 eol
= Fget (XCONS (tmp
)->car
, Qeol_type
);
4532 XCONS (tmp
)->car
= XVECTOR (eol
)->contents
[eol_type
];
4535 return (highest
? XCONS (val
)->car
: val
);
4538 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
4540 "Detect coding system of the text in the region between START and END.\n\
4541 Return a list of possible coding systems ordered by priority.\n\
4543 If only ASCII characters are found, it returns `undecided'\n\
4544 or its subsidiary coding system according to a detected end-of-line format.\n\
4546 If optional argument HIGHEST is non-nil, return the coding system of\n\
4548 (start
, end
, highest
)
4549 Lisp_Object start
, end
, highest
;
4552 int from_byte
, to_byte
;
4554 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4555 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4557 validate_region (&start
, &end
);
4558 from
= XINT (start
), to
= XINT (end
);
4559 from_byte
= CHAR_TO_BYTE (from
);
4560 to_byte
= CHAR_TO_BYTE (to
);
4562 if (from
< GPT
&& to
>= GPT
)
4563 move_gap_both (to
, to_byte
);
4565 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
4566 to_byte
- from_byte
,
4570 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
4572 "Detect coding system of the text in STRING.\n\
4573 Return a list of possible coding systems ordered by priority.\n\
4575 If only ASCII characters are found, it returns `undecided'\n\
4576 or its subsidiary coding system according to a detected end-of-line format.\n\
4578 If optional argument HIGHEST is non-nil, return the coding system of\n\
4581 Lisp_Object string
, highest
;
4583 CHECK_STRING (string
, 0);
4585 return detect_coding_system (XSTRING (string
)->data
,
4586 STRING_BYTES (XSTRING (string
)),
4591 code_convert_region1 (start
, end
, coding_system
, encodep
)
4592 Lisp_Object start
, end
, coding_system
;
4595 struct coding_system coding
;
4598 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4599 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4600 CHECK_SYMBOL (coding_system
, 2);
4602 validate_region (&start
, &end
);
4603 from
= XFASTINT (start
);
4604 to
= XFASTINT (end
);
4606 if (NILP (coding_system
))
4607 return make_number (to
- from
);
4609 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4610 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4612 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4613 code_convert_region (from
, CHAR_TO_BYTE (from
), to
, CHAR_TO_BYTE (to
),
4614 &coding
, encodep
, 1);
4615 return make_number (coding
.produced_char
);
4618 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
4619 3, 3, "r\nzCoding system: ",
4620 "Decode the current region by specified coding system.\n\
4621 When called from a program, takes three arguments:\n\
4622 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4623 Return length of decoded text.")
4624 (start
, end
, coding_system
)
4625 Lisp_Object start
, end
, coding_system
;
4627 return code_convert_region1 (start
, end
, coding_system
, 0);
4630 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
4631 3, 3, "r\nzCoding system: ",
4632 "Encode the current region by specified coding system.\n\
4633 When called from a program, takes three arguments:\n\
4634 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4635 Return length of encoded text.")
4636 (start
, end
, coding_system
)
4637 Lisp_Object start
, end
, coding_system
;
4639 return code_convert_region1 (start
, end
, coding_system
, 1);
4643 code_convert_string1 (string
, coding_system
, nocopy
, encodep
)
4644 Lisp_Object string
, coding_system
, nocopy
;
4647 struct coding_system coding
;
4649 CHECK_STRING (string
, 0);
4650 CHECK_SYMBOL (coding_system
, 1);
4652 if (NILP (coding_system
))
4653 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
4655 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4656 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4658 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4659 return code_convert_string (string
, &coding
, encodep
, !NILP (nocopy
));
4662 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
4664 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4665 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4666 if the decoding operation is trivial.")
4667 (string
, coding_system
, nocopy
)
4668 Lisp_Object string
, coding_system
, nocopy
;
4670 return code_convert_string1(string
, coding_system
, nocopy
, 0);
4673 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
4675 "Encode STRING to CODING-SYSTEM, and return the result.\n\
4676 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4677 if the encoding operation is trivial.")
4678 (string
, coding_system
, nocopy
)
4679 Lisp_Object string
, coding_system
, nocopy
;
4681 return code_convert_string1(string
, coding_system
, nocopy
, 1);
4685 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
4686 "Decode a JISX0208 character of shift-jis encoding.\n\
4687 CODE is the character code in SJIS.\n\
4688 Return the corresponding character.")
4692 unsigned char c1
, c2
, s1
, s2
;
4695 CHECK_NUMBER (code
, 0);
4696 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
4697 DECODE_SJIS (s1
, s2
, c1
, c2
);
4698 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
4702 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
4703 "Encode a JISX0208 character CHAR to SJIS coding system.\n\
4704 Return the corresponding character code in SJIS.")
4708 int charset
, c1
, c2
, s1
, s2
;
4711 CHECK_NUMBER (ch
, 0);
4712 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
4713 if (charset
== charset_jisx0208
)
4715 ENCODE_SJIS (c1
, c2
, s1
, s2
);
4716 XSETFASTINT (val
, (s1
<< 8) | s2
);
4719 XSETFASTINT (val
, 0);
4723 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
4724 "Decode a Big5 character CODE of BIG5 coding system.\n\
4725 CODE is the character code in BIG5.\n\
4726 Return the corresponding character.")
4731 unsigned char b1
, b2
, c1
, c2
;
4734 CHECK_NUMBER (code
, 0);
4735 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
4736 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
4737 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
4741 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
4742 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4743 Return the corresponding character code in Big5.")
4747 int charset
, c1
, c2
, b1
, b2
;
4750 CHECK_NUMBER (ch
, 0);
4751 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
4752 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
4754 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
4755 XSETFASTINT (val
, (b1
<< 8) | b2
);
4758 XSETFASTINT (val
, 0);
4762 DEFUN ("set-terminal-coding-system-internal",
4763 Fset_terminal_coding_system_internal
,
4764 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
4766 Lisp_Object coding_system
;
4768 CHECK_SYMBOL (coding_system
, 0);
4769 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
4770 /* We had better not send unsafe characters to terminal. */
4771 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
4776 DEFUN ("set-safe-terminal-coding-system-internal",
4777 Fset_safe_terminal_coding_system_internal
,
4778 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
4780 Lisp_Object coding_system
;
4782 CHECK_SYMBOL (coding_system
, 0);
4783 setup_coding_system (Fcheck_coding_system (coding_system
),
4784 &safe_terminal_coding
);
4788 DEFUN ("terminal-coding-system",
4789 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
4790 "Return coding system specified for terminal output.")
4793 return terminal_coding
.symbol
;
4796 DEFUN ("set-keyboard-coding-system-internal",
4797 Fset_keyboard_coding_system_internal
,
4798 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
4800 Lisp_Object coding_system
;
4802 CHECK_SYMBOL (coding_system
, 0);
4803 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
4807 DEFUN ("keyboard-coding-system",
4808 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
4809 "Return coding system specified for decoding keyboard input.")
4812 return keyboard_coding
.symbol
;
4816 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
4817 Sfind_operation_coding_system
, 1, MANY
, 0,
4818 "Choose a coding system for an operation based on the target name.\n\
4819 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
4820 DECODING-SYSTEM is the coding system to use for decoding\n\
4821 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
4822 for encoding (in case OPERATION does encoding).\n\
4824 The first argument OPERATION specifies an I/O primitive:\n\
4825 For file I/O, `insert-file-contents' or `write-region'.\n\
4826 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
4827 For network I/O, `open-network-stream'.\n\
4829 The remaining arguments should be the same arguments that were passed\n\
4830 to the primitive. Depending on which primitive, one of those arguments\n\
4831 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
4832 whichever argument specifies the file name is TARGET.\n\
4834 TARGET has a meaning which depends on OPERATION:\n\
4835 For file I/O, TARGET is a file name.\n\
4836 For process I/O, TARGET is a process name.\n\
4837 For network I/O, TARGET is a service name or a port number\n\
4839 This function looks up what specified for TARGET in,\n\
4840 `file-coding-system-alist', `process-coding-system-alist',\n\
4841 or `network-coding-system-alist' depending on OPERATION.\n\
4842 They may specify a coding system, a cons of coding systems,\n\
4843 or a function symbol to call.\n\
4844 In the last case, we call the function with one argument,\n\
4845 which is a list of all the arguments given to this function.")
4850 Lisp_Object operation
, target_idx
, target
, val
;
4851 register Lisp_Object chain
;
4854 error ("Too few arguments");
4855 operation
= args
[0];
4856 if (!SYMBOLP (operation
)
4857 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
4858 error ("Invalid first arguement");
4859 if (nargs
< 1 + XINT (target_idx
))
4860 error ("Too few arguments for operation: %s",
4861 XSYMBOL (operation
)->name
->data
);
4862 target
= args
[XINT (target_idx
) + 1];
4863 if (!(STRINGP (target
)
4864 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
4865 error ("Invalid %dth argument", XINT (target_idx
) + 1);
4867 chain
= ((EQ (operation
, Qinsert_file_contents
)
4868 || EQ (operation
, Qwrite_region
))
4869 ? Vfile_coding_system_alist
4870 : (EQ (operation
, Qopen_network_stream
)
4871 ? Vnetwork_coding_system_alist
4872 : Vprocess_coding_system_alist
));
4876 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
4879 elt
= XCONS (chain
)->car
;
4882 && ((STRINGP (target
)
4883 && STRINGP (XCONS (elt
)->car
)
4884 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
4885 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
4887 val
= XCONS (elt
)->cdr
;
4888 /* Here, if VAL is both a valid coding system and a valid
4889 function symbol, we return VAL as a coding system. */
4892 if (! SYMBOLP (val
))
4894 if (! NILP (Fcoding_system_p (val
)))
4895 return Fcons (val
, val
);
4896 if (! NILP (Ffboundp (val
)))
4898 val
= call1 (val
, Flist (nargs
, args
));
4901 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
4902 return Fcons (val
, val
);
4910 DEFUN ("update-iso-coding-systems", Fupdate_iso_coding_systems
,
4911 Supdate_iso_coding_systems
, 0, 0, 0,
4912 "Update internal database for ISO2022 based coding systems.\n\
4913 When values of the following coding categories are changed, you must\n\
4914 call this function:\n\
4915 coding-category-iso-7, coding-category-iso-7-tight,\n\
4916 coding-category-iso-8-1, coding-category-iso-8-2,\n\
4917 coding-category-iso-7-else, coding-category-iso-8-else")
4922 for (i
= CODING_CATEGORY_IDX_ISO_7
; i
<= CODING_CATEGORY_IDX_ISO_8_ELSE
;
4925 if (! coding_system_table
[i
])
4926 coding_system_table
[i
]
4927 = (struct coding_system
*) xmalloc (sizeof (struct coding_system
));
4929 (XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[i
])->value
,
4930 coding_system_table
[i
]);
4938 /*** 8. Post-amble ***/
4944 /* Emacs' internal format specific initialize routine. */
4945 for (i
= 0; i
<= 0x20; i
++)
4946 emacs_code_class
[i
] = EMACS_control_code
;
4947 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
4948 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
4949 for (i
= 0x21 ; i
< 0x7F; i
++)
4950 emacs_code_class
[i
] = EMACS_ascii_code
;
4951 emacs_code_class
[0x7F] = EMACS_control_code
;
4952 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
4953 for (i
= 0x81; i
< 0xFF; i
++)
4954 emacs_code_class
[i
] = EMACS_invalid_code
;
4955 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
4956 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
4957 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
4958 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
4960 /* ISO2022 specific initialize routine. */
4961 for (i
= 0; i
< 0x20; i
++)
4962 iso_code_class
[i
] = ISO_control_code
;
4963 for (i
= 0x21; i
< 0x7F; i
++)
4964 iso_code_class
[i
] = ISO_graphic_plane_0
;
4965 for (i
= 0x80; i
< 0xA0; i
++)
4966 iso_code_class
[i
] = ISO_control_code
;
4967 for (i
= 0xA1; i
< 0xFF; i
++)
4968 iso_code_class
[i
] = ISO_graphic_plane_1
;
4969 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
4970 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
4971 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
4972 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
4973 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
4974 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
4975 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
4976 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
4977 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
4978 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
4980 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
4981 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
4983 setup_coding_system (Qnil
, &keyboard_coding
);
4984 setup_coding_system (Qnil
, &terminal_coding
);
4985 setup_coding_system (Qnil
, &safe_terminal_coding
);
4987 bzero (coding_system_table
, sizeof coding_system_table
);
4989 #if defined (MSDOS) || defined (WINDOWSNT)
4990 system_eol_type
= CODING_EOL_CRLF
;
4992 system_eol_type
= CODING_EOL_LF
;
5000 Qtarget_idx
= intern ("target-idx");
5001 staticpro (&Qtarget_idx
);
5003 Qcoding_system_history
= intern ("coding-system-history");
5004 staticpro (&Qcoding_system_history
);
5005 Fset (Qcoding_system_history
, Qnil
);
5007 /* Target FILENAME is the first argument. */
5008 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
5009 /* Target FILENAME is the third argument. */
5010 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
5012 Qcall_process
= intern ("call-process");
5013 staticpro (&Qcall_process
);
5014 /* Target PROGRAM is the first argument. */
5015 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
5017 Qcall_process_region
= intern ("call-process-region");
5018 staticpro (&Qcall_process_region
);
5019 /* Target PROGRAM is the third argument. */
5020 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
5022 Qstart_process
= intern ("start-process");
5023 staticpro (&Qstart_process
);
5024 /* Target PROGRAM is the third argument. */
5025 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
5027 Qopen_network_stream
= intern ("open-network-stream");
5028 staticpro (&Qopen_network_stream
);
5029 /* Target SERVICE is the fourth argument. */
5030 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
5032 Qcoding_system
= intern ("coding-system");
5033 staticpro (&Qcoding_system
);
5035 Qeol_type
= intern ("eol-type");
5036 staticpro (&Qeol_type
);
5038 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
5039 staticpro (&Qbuffer_file_coding_system
);
5041 Qpost_read_conversion
= intern ("post-read-conversion");
5042 staticpro (&Qpost_read_conversion
);
5044 Qpre_write_conversion
= intern ("pre-write-conversion");
5045 staticpro (&Qpre_write_conversion
);
5047 Qno_conversion
= intern ("no-conversion");
5048 staticpro (&Qno_conversion
);
5050 Qundecided
= intern ("undecided");
5051 staticpro (&Qundecided
);
5053 Qcoding_system_p
= intern ("coding-system-p");
5054 staticpro (&Qcoding_system_p
);
5056 Qcoding_system_error
= intern ("coding-system-error");
5057 staticpro (&Qcoding_system_error
);
5059 Fput (Qcoding_system_error
, Qerror_conditions
,
5060 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
5061 Fput (Qcoding_system_error
, Qerror_message
,
5062 build_string ("Invalid coding system"));
5064 Qcoding_category
= intern ("coding-category");
5065 staticpro (&Qcoding_category
);
5066 Qcoding_category_index
= intern ("coding-category-index");
5067 staticpro (&Qcoding_category_index
);
5069 Vcoding_category_table
5070 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX
), Qnil
);
5071 staticpro (&Vcoding_category_table
);
5074 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
5076 XVECTOR (Vcoding_category_table
)->contents
[i
]
5077 = intern (coding_category_name
[i
]);
5078 Fput (XVECTOR (Vcoding_category_table
)->contents
[i
],
5079 Qcoding_category_index
, make_number (i
));
5083 Qcharacter_unification_table
= intern ("character-unification-table");
5084 staticpro (&Qcharacter_unification_table
);
5085 Fput (Qcharacter_unification_table
, Qchar_table_extra_slots
,
5088 Qcharacter_unification_table_for_decode
5089 = intern ("character-unification-table-for-decode");
5090 staticpro (&Qcharacter_unification_table_for_decode
);
5092 Qcharacter_unification_table_for_encode
5093 = intern ("character-unification-table-for-encode");
5094 staticpro (&Qcharacter_unification_table_for_encode
);
5096 Qsafe_charsets
= intern ("safe-charsets");
5097 staticpro (&Qsafe_charsets
);
5099 Qemacs_mule
= intern ("emacs-mule");
5100 staticpro (&Qemacs_mule
);
5102 Qraw_text
= intern ("raw-text");
5103 staticpro (&Qraw_text
);
5105 defsubr (&Scoding_system_p
);
5106 defsubr (&Sread_coding_system
);
5107 defsubr (&Sread_non_nil_coding_system
);
5108 defsubr (&Scheck_coding_system
);
5109 defsubr (&Sdetect_coding_region
);
5110 defsubr (&Sdetect_coding_string
);
5111 defsubr (&Sdecode_coding_region
);
5112 defsubr (&Sencode_coding_region
);
5113 defsubr (&Sdecode_coding_string
);
5114 defsubr (&Sencode_coding_string
);
5115 defsubr (&Sdecode_sjis_char
);
5116 defsubr (&Sencode_sjis_char
);
5117 defsubr (&Sdecode_big5_char
);
5118 defsubr (&Sencode_big5_char
);
5119 defsubr (&Sset_terminal_coding_system_internal
);
5120 defsubr (&Sset_safe_terminal_coding_system_internal
);
5121 defsubr (&Sterminal_coding_system
);
5122 defsubr (&Sset_keyboard_coding_system_internal
);
5123 defsubr (&Skeyboard_coding_system
);
5124 defsubr (&Sfind_operation_coding_system
);
5125 defsubr (&Supdate_iso_coding_systems
);
5127 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
5128 "List of coding systems.\n\
5130 Do not alter the value of this variable manually. This variable should be\n\
5131 updated by the functions `make-coding-system' and\n\
5132 `define-coding-system-alias'.");
5133 Vcoding_system_list
= Qnil
;
5135 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
5136 "Alist of coding system names.\n\
5137 Each element is one element list of coding system name.\n\
5138 This variable is given to `completing-read' as TABLE argument.\n\
5140 Do not alter the value of this variable manually. This variable should be\n\
5141 updated by the functions `make-coding-system' and\n\
5142 `define-coding-system-alias'.");
5143 Vcoding_system_alist
= Qnil
;
5145 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
5146 "List of coding-categories (symbols) ordered by priority.");
5150 Vcoding_category_list
= Qnil
;
5151 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
5152 Vcoding_category_list
5153 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
5154 Vcoding_category_list
);
5157 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
5158 "Specify the coding system for read operations.\n\
5159 It is useful to bind this variable with `let', but do not set it globally.\n\
5160 If the value is a coding system, it is used for decoding on read operation.\n\
5161 If not, an appropriate element is used from one of the coding system alists:\n\
5162 There are three such tables, `file-coding-system-alist',\n\
5163 `process-coding-system-alist', and `network-coding-system-alist'.");
5164 Vcoding_system_for_read
= Qnil
;
5166 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
5167 "Specify the coding system for write operations.\n\
5168 It is useful to bind this variable with `let', but do not set it globally.\n\
5169 If the value is a coding system, it is used for encoding on write operation.\n\
5170 If not, an appropriate element is used from one of the coding system alists:\n\
5171 There are three such tables, `file-coding-system-alist',\n\
5172 `process-coding-system-alist', and `network-coding-system-alist'.");
5173 Vcoding_system_for_write
= Qnil
;
5175 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
5176 "Coding system used in the latest file or process I/O.");
5177 Vlast_coding_system_used
= Qnil
;
5179 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
5180 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
5181 inhibit_eol_conversion
= 0;
5183 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
5184 "Alist to decide a coding system to use for a file I/O operation.\n\
5185 The format is ((PATTERN . VAL) ...),\n\
5186 where PATTERN is a regular expression matching a file name,\n\
5187 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5188 If VAL is a coding system, it is used for both decoding and encoding\n\
5189 the file contents.\n\
5190 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5191 and the cdr part is used for encoding.\n\
5192 If VAL is a function symbol, the function must return a coding system\n\
5193 or a cons of coding systems which are used as above.\n\
5195 See also the function `find-operation-coding-system'.");
5196 Vfile_coding_system_alist
= Qnil
;
5198 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
5199 "Alist to decide a coding system to use for a process I/O operation.\n\
5200 The format is ((PATTERN . VAL) ...),\n\
5201 where PATTERN is a regular expression matching a program name,\n\
5202 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5203 If VAL is a coding system, it is used for both decoding what received\n\
5204 from the program and encoding what sent to the program.\n\
5205 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5206 and the cdr part is used for encoding.\n\
5207 If VAL is a function symbol, the function must return a coding system\n\
5208 or a cons of coding systems which are used as above.\n\
5210 See also the function `find-operation-coding-system'.");
5211 Vprocess_coding_system_alist
= Qnil
;
5213 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
5214 "Alist to decide a coding system to use for a network I/O operation.\n\
5215 The format is ((PATTERN . VAL) ...),\n\
5216 where PATTERN is a regular expression matching a network service name\n\
5217 or is a port number to connect to,\n\
5218 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5219 If VAL is a coding system, it is used for both decoding what received\n\
5220 from the network stream and encoding what sent to the network stream.\n\
5221 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5222 and the cdr part is used for encoding.\n\
5223 If VAL is a function symbol, the function must return a coding system\n\
5224 or a cons of coding systems which are used as above.\n\
5226 See also the function `find-operation-coding-system'.");
5227 Vnetwork_coding_system_alist
= Qnil
;
5229 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
5230 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
5231 eol_mnemonic_unix
= ':';
5233 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
5234 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5235 eol_mnemonic_dos
= '\\';
5237 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
5238 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5239 eol_mnemonic_mac
= '/';
5241 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
5242 "Mnemonic character indicating end-of-line format is not yet decided.");
5243 eol_mnemonic_undecided
= ':';
5245 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification
,
5246 "Non-nil means ISO 2022 encoder/decoder do character unification.");
5247 Venable_character_unification
= Qt
;
5249 DEFVAR_LISP ("standard-character-unification-table-for-decode",
5250 &Vstandard_character_unification_table_for_decode
,
5251 "Table for unifying characters when reading.");
5252 Vstandard_character_unification_table_for_decode
= Qnil
;
5254 DEFVAR_LISP ("standard-character-unification-table-for-encode",
5255 &Vstandard_character_unification_table_for_encode
,
5256 "Table for unifying characters when writing.");
5257 Vstandard_character_unification_table_for_encode
= Qnil
;
5259 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
5260 "Alist of charsets vs revision numbers.\n\
5261 While encoding, if a charset (car part of an element) is found,\n\
5262 designate it with the escape sequence identifying revision (cdr part of the element).");
5263 Vcharset_revision_alist
= Qnil
;
5265 DEFVAR_LISP ("default-process-coding-system",
5266 &Vdefault_process_coding_system
,
5267 "Cons of coding systems used for process I/O by default.\n\
5268 The car part is used for decoding a process output,\n\
5269 the cdr part is used for encoding a text to be sent to a process.");
5270 Vdefault_process_coding_system
= Qnil
;
5272 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
5273 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5274 This is a vector of length 256.\n\
5275 If Nth element is non-nil, the existence of code N in a file\n\
5276 \(or output of subprocess) doesn't prevent it to be detected as\n\
5277 a coding system of ISO 2022 variant which has a flag\n\
5278 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5279 or reading output of a subprocess.\n\
5280 Only 128th through 159th elements have a meaning.");
5281 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
5283 DEFVAR_LISP ("select-safe-coding-system-function",
5284 &Vselect_safe_coding_system_function
,
5285 "Function to call to select safe coding system for encoding a text.\n\
5287 If set, this function is called to force a user to select a proper\n\
5288 coding system which can encode the text in the case that a default\n\
5289 coding system used in each operation can't encode the text.\n\
5291 The default value is `select-safe-coding-system' (which see).");
5292 Vselect_safe_coding_system_function
= Qnil
;