1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See section 6 for more details.
84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
86 How the end-of-line of a text is encoded depends on the system. For
87 instance, Unix's format is just one byte of `line-feed' code,
88 whereas DOS's format is two-byte sequence of `carriage-return' and
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
93 any format of end-of-line. So, Emacs has information of format of
94 end-of-line in each coding-system. See section 6 for more details.
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which the appropriate flag bits for the category XXX are set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
107 detect_coding_emacs_mule (src
, src_end
)
108 unsigned char *src
, *src_end
;
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format (emacs-mule). The resulting text
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
123 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
124 struct coding_system
*coding
;
125 unsigned char *source
, *destination
;
126 int src_bytes
, dst_bytes
;
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
142 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
143 struct coding_system
*coding
;
144 unsigned char *source
, *destination
;
145 int src_bytes
, dst_bytes
;
152 /*** COMMONLY USED MACROS ***/
154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
160 #define ONE_MORE_BYTE(c1) \
165 goto label_end_of_loop; \
168 #define TWO_MORE_BYTES(c1, c2) \
170 if (src + 1 < src_end) \
171 c1 = *src++, c2 = *src++; \
173 goto label_end_of_loop; \
176 #define THREE_MORE_BYTES(c1, c2, c3) \
178 if (src + 2 < src_end) \
179 c1 = *src++, c2 = *src++, c3 = *src++; \
181 goto label_end_of_loop; \
184 /* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
191 /* Decode one ASCII character C. */
193 #define DECODE_CHARACTER_ASCII(c) \
195 if (COMPOSING_P (coding->composing)) \
196 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
202 position-code is C. */
204 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
206 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
207 if (COMPOSING_P (coding->composing)) \
208 *dst++ = leading_code + 0x20; \
210 *dst++ = leading_code; \
211 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
212 *dst++ = leading_code; \
213 *dst++ = (c) | 0x80; \
216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
217 position-codes are C1 and C2. */
219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
221 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
222 *dst++ = (c2) | 0x80; \
226 /*** 1. Preamble ***/
240 #else /* not emacs */
244 #endif /* not emacs */
246 Lisp_Object Qcoding_system
, Qeol_type
;
247 Lisp_Object Qbuffer_file_coding_system
;
248 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
250 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
251 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
252 Lisp_Object Qstart_process
, Qopen_network_stream
;
253 Lisp_Object Qtarget_idx
;
255 /* Mnemonic character of each format of end-of-line. */
256 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
257 /* Mnemonic character to indicate format of end-of-line is not yet
259 int eol_mnemonic_undecided
;
261 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
267 Lisp_Object Qcoding_system_spec
, Qcoding_system_p
, Qcoding_system_error
;
269 /* Coding system emacs-mule is for converting only end-of-line format. */
270 Lisp_Object Qemacs_mule
;
272 /* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274 /* Coding-system for reading files and receiving data from process. */
275 Lisp_Object Vcoding_system_for_read
;
276 /* Coding-system for writing files and sending data to process. */
277 Lisp_Object Vcoding_system_for_write
;
278 /* Coding-system actually used in the latest I/O. */
279 Lisp_Object Vlast_coding_system_used
;
281 /* Flag to inhibit code conversion of end-of-line format. */
282 int inhibit_eol_conversion
;
284 /* Coding-system that the terminal accepts for displaying. */
285 struct coding_system terminal_coding
;
287 /* Coding-system of what is sent from terminal keyboard. */
288 struct coding_system keyboard_coding
;
290 Lisp_Object Vfile_coding_system_alist
;
291 Lisp_Object Vprocess_coding_system_alist
;
292 Lisp_Object Vnetwork_coding_system_alist
;
296 Lisp_Object Qcoding_category_index
;
298 /* List of symbols `coding-category-xxx' ordered by priority. */
299 Lisp_Object Vcoding_category_list
;
301 /* Table of coding-systems currently assigned to each coding-category. */
302 Lisp_Object coding_category_table
[CODING_CATEGORY_IDX_MAX
];
304 /* Table of symbol names for each coding-category. */
305 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
306 "coding-category-emacs-mule",
307 "coding-category-sjis",
308 "coding-category-iso-7",
309 "coding-category-iso-8-1",
310 "coding-category-iso-8-2",
311 "coding-category-iso-7-else",
312 "coding-category-iso-8-else",
313 "coding-category-big5",
314 "coding-category-binary"
317 /* Flag to tell if we look up unification table on character code
319 Lisp_Object Venable_character_unification
;
320 /* Standard unification table to look up on decoding (reading). */
321 Lisp_Object Vstandard_character_unification_table_for_decode
;
322 /* Standard unification table to look up on encoding (writing). */
323 Lisp_Object Vstandard_character_unification_table_for_encode
;
325 Lisp_Object Qcharacter_unification_table
;
326 Lisp_Object Qcharacter_unification_table_for_decode
;
327 Lisp_Object Qcharacter_unification_table_for_encode
;
329 /* Alist of charsets vs revision number. */
330 Lisp_Object Vcharset_revision_alist
;
332 /* Default coding systems used for process I/O. */
333 Lisp_Object Vdefault_process_coding_system
;
336 /*** 2. Emacs internal format (emacs-mule) handlers ***/
338 /* Emacs' internal format for encoding multiple character sets is a
339 kind of multi-byte encoding, i.e. characters are encoded by
340 variable-length sequences of one-byte codes. ASCII characters
341 and control characters (e.g. `tab', `newline') are represented by
342 one-byte sequences which are their ASCII codes, in the range 0x00
343 through 0x7F. The other characters are represented by a sequence
344 of `base leading-code', optional `extended leading-code', and one
345 or two `position-code's. The length of the sequence is determined
346 by the base leading-code. Leading-code takes the range 0x80
347 through 0x9F, whereas extended leading-code and position-code take
348 the range 0xA0 through 0xFF. See `charset.h' for more details
349 about leading-code and position-code.
351 There's one exception to this rule. Special leading-code
352 `leading-code-composition' denotes that the following several
353 characters should be composed into one character. Leading-codes of
354 components (except for ASCII) are added 0x20. An ASCII character
355 component is represented by a 2-byte sequence of `0xA0' and
356 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
357 details of composite character. Hence, we can summarize the code
360 --- CODE RANGE of Emacs' internal format ---
361 (character set) (range)
363 ELSE (1st byte) 0x80 .. 0x9F
364 (rest bytes) 0xA0 .. 0xFF
365 ---------------------------------------------
369 enum emacs_code_class_type emacs_code_class
[256];
371 /* Go to the next statement only if *SRC is accessible and the code is
372 greater than or equal to 0xA0. */
373 #define CHECK_CODE_RANGE_A0_FF \
375 if (src >= src_end) \
376 goto label_end_of_switch; \
377 else if (*src++ < 0xA0) \
381 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
382 Check if a text is encoded in Emacs' internal format. If it is,
383 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
386 detect_coding_emacs_mule (src
, src_end
)
387 unsigned char *src
, *src_end
;
392 while (src
< src_end
)
404 switch (emacs_code_class
[c
])
406 case EMACS_ascii_code
:
407 case EMACS_linefeed_code
:
410 case EMACS_control_code
:
411 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
415 case EMACS_invalid_code
:
418 case EMACS_leading_code_composition
: /* c == 0x80 */
420 CHECK_CODE_RANGE_A0_FF
;
425 case EMACS_leading_code_4
:
426 CHECK_CODE_RANGE_A0_FF
;
427 /* fall down to check it two more times ... */
429 case EMACS_leading_code_3
:
430 CHECK_CODE_RANGE_A0_FF
;
431 /* fall down to check it one more time ... */
433 case EMACS_leading_code_2
:
434 CHECK_CODE_RANGE_A0_FF
;
442 return CODING_CATEGORY_MASK_EMACS_MULE
;
446 /*** 3. ISO2022 handlers ***/
448 /* The following note describes the coding system ISO2022 briefly.
449 Since the intention of this note is to help in understanding of
450 the programs in this file, some parts are NOT ACCURATE or OVERLY
451 SIMPLIFIED. For the thorough understanding, please refer to the
452 original document of ISO2022.
454 ISO2022 provides many mechanisms to encode several character sets
455 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
456 all text is encoded by codes of less than 128. This may make the
457 encoded text a little bit longer, but the text gets more stability
458 to pass through several gateways (some of them strip off the MSB).
460 There are two kinds of character set: control character set and
461 graphic character set. The former contains control characters such
462 as `newline' and `escape' to provide control functions (control
463 functions are provided also by escape sequences). The latter
464 contains graphic characters such as 'A' and '-'. Emacs recognizes
465 two control character sets and many graphic character sets.
467 Graphic character sets are classified into one of the following
468 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
469 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
470 bytes (DIMENSION) and the number of characters in one dimension
471 (CHARS) of the set. In addition, each character set is assigned an
472 identification tag (called "final character" and denoted as <F>
473 hereafter) which is unique in each class. <F> of each character
474 set is decided by ECMA(*) when it is registered in ISO. Code range
475 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
477 Note (*): ECMA = European Computer Manufacturers Association
479 Here are examples of graphic character set [NAME(<F>)]:
480 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
481 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
482 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
483 o DIMENSION2_CHARS96 -- none for the moment
485 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
486 C0 [0x00..0x1F] -- control character plane 0
487 GL [0x20..0x7F] -- graphic character plane 0
488 C1 [0x80..0x9F] -- control character plane 1
489 GR [0xA0..0xFF] -- graphic character plane 1
491 A control character set is directly designated and invoked to C0 or
492 C1 by an escape sequence. The most common case is that ISO646's
493 control character set is designated/invoked to C0 and ISO6429's
494 control character set is designated/invoked to C1, and usually
495 these designations/invocations are omitted in a coded text. With
496 7-bit environment, only C0 can be used, and a control character for
497 C1 is encoded by an appropriate escape sequence to fit in the
498 environment. All control characters for C1 have corresponding
499 escape sequences defined.
501 A graphic character set is at first designated to one of four
502 graphic registers (G0 through G3), then these graphic registers are
503 invoked to GL or GR. These designations and invocations can be
504 done independently. The most common case is that G0 is invoked to
505 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
506 these invocations and designations are omitted in a coded text.
507 With 7-bit environment, only GL can be used.
509 When a graphic character set of CHARS94 is invoked to GL, code 0x20
510 and 0x7F of GL area work as control characters SPACE and DEL
511 respectively, and code 0xA0 and 0xFF of GR area should not be used.
513 There are two ways of invocation: locking-shift and single-shift.
514 With locking-shift, the invocation lasts until the next different
515 invocation, whereas with single-shift, the invocation works only
516 for the following character and doesn't affect locking-shift.
517 Invocations are done by the following control characters or escape
520 ----------------------------------------------------------------------
521 function control char escape sequence description
522 ----------------------------------------------------------------------
523 SI (shift-in) 0x0F none invoke G0 to GL
524 SO (shift-out) 0x0E none invoke G1 to GL
525 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
526 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
527 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
528 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
529 ----------------------------------------------------------------------
530 The first four are for locking-shift. Control characters for these
531 functions are defined by macros ISO_CODE_XXX in `coding.h'.
533 Designations are done by the following escape sequences.
534 ----------------------------------------------------------------------
535 escape sequence description
536 ----------------------------------------------------------------------
537 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
538 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
539 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
540 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
541 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
542 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
543 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
544 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
545 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
546 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
547 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
548 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
549 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
550 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
551 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
552 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
553 ----------------------------------------------------------------------
555 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
556 of dimension 1, chars 94, and final character <F>, etc.
558 Note (*): Although these designations are not allowed in ISO2022,
559 Emacs accepts them on decoding, and produces them on encoding
560 CHARS96 character set in a coding system which is characterized as
561 7-bit environment, non-locking-shift, and non-single-shift.
563 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
564 '(' can be omitted. We call this "short-form" hereafter.
566 Now you may notice that there are a lot of ways for encoding the
567 same multilingual text in ISO2022. Actually, there exist many
568 coding systems such as Compound Text (used in X's inter-client
569 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
570 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
571 localized platforms), and all of these are variants of ISO2022.
573 In addition to the above, Emacs handles two more kinds of escape
574 sequences: ISO6429's direction specification and Emacs' private
575 sequence for specifying character composition.
577 ISO6429's direction specification takes the following format:
578 o CSI ']' -- end of the current direction
579 o CSI '0' ']' -- end of the current direction
580 o CSI '1' ']' -- start of left-to-right text
581 o CSI '2' ']' -- start of right-to-left text
582 The control character CSI (0x9B: control sequence introducer) is
583 abbreviated to the escape sequence ESC '[' in 7-bit environment.
585 Character composition specification takes the following format:
586 o ESC '0' -- start character composition
587 o ESC '1' -- end character composition
588 Since these are not standard escape sequences of any ISO, the use
589 of them for these meanings is restricted to Emacs only. */
591 enum iso_code_class_type iso_code_class
[256];
593 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
594 Check if a text is encoded in ISO2022. If it is, return an
595 integer in which the appropriate flag bits, any of:
596 CODING_CATEGORY_MASK_ISO_7
597 CODING_CATEGORY_MASK_ISO_8_1
598 CODING_CATEGORY_MASK_ISO_8_2
599 CODING_CATEGORY_MASK_ISO_7_ELSE
600 CODING_CATEGORY_MASK_ISO_8_ELSE
601 are set. If a code which should never appear in ISO2022 is found,
605 detect_coding_iso2022 (src
, src_end
)
606 unsigned char *src
, *src_end
;
608 int mask
= (CODING_CATEGORY_MASK_ISO_7
609 | CODING_CATEGORY_MASK_ISO_8_1
610 | CODING_CATEGORY_MASK_ISO_8_2
611 | CODING_CATEGORY_MASK_ISO_7_ELSE
612 | CODING_CATEGORY_MASK_ISO_8_ELSE
614 int g1
= 0; /* 1 iff designating to G1. */
617 while (src
< src_end
)
626 if ((c
>= '(' && c
<= '/'))
628 /* Designation sequence for a charset of dimension 1. */
632 if (c
< ' ' || c
>= 0x80)
633 /* Invalid designation sequence. */
638 /* Designation sequence for a charset of dimension 2. */
642 if (c
>= '@' && c
<= 'B')
643 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
645 else if (c
>= '(' && c
<= '/')
650 if (c
< ' ' || c
>= 0x80)
651 /* Invalid designation sequence. */
655 /* Invalid designation sequence. */
658 else if (c
== 'N' || c
== 'O' || c
== 'n' || c
== 'o')
660 mask
&= (CODING_CATEGORY_MASK_ISO_7_ELSE
661 | CODING_CATEGORY_MASK_ISO_8_ELSE
);
662 else if (c
== '0' || c
== '1' || c
== '2')
663 /* Start/end composition. */
666 /* Invalid escape sequence. */
671 mask
&= (CODING_CATEGORY_MASK_ISO_7_ELSE
672 | CODING_CATEGORY_MASK_ISO_8_ELSE
);
678 return CODING_CATEGORY_MASK_ISO_8_ELSE
;
687 unsigned char *src_begin
= src
;
689 mask
&= ~(CODING_CATEGORY_MASK_ISO_7
690 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
691 while (src
< src_end
&& *src
>= 0xA0)
693 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
694 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
703 /* Decode a character of which charset is CHARSET and the 1st position
704 code is C1. If dimension of CHARSET is 2, the 2nd position code is
705 fetched from SRC and set to C2. If CHARSET is negative, it means
706 that we are decoding ill formed text, and what we can do is just to
709 #define DECODE_ISO_CHARACTER(charset, c1) \
711 int c_alt, charset_alt = (charset); \
712 if (COMPOSING_HEAD_P (coding->composing)) \
714 *dst++ = LEADING_CODE_COMPOSITION; \
715 if (COMPOSING_WITH_RULE_P (coding->composing)) \
716 /* To tell composition rules are embeded. */ \
718 coding->composing += 2; \
720 if ((charset) >= 0) \
722 if (CHARSET_DIMENSION (charset) == 2) \
723 ONE_MORE_BYTE (c2); \
724 if (!NILP (unification_table) \
725 && ((c_alt = unify_char (unification_table, \
726 -1, (charset), c1, c2)) >= 0)) \
727 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
729 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
730 DECODE_CHARACTER_ASCII (c1); \
731 else if (CHARSET_DIMENSION (charset_alt) == 1) \
732 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
734 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
735 if (COMPOSING_WITH_RULE_P (coding->composing)) \
736 /* To tell a composition rule follows. */ \
737 coding->composing = COMPOSING_WITH_RULE_RULE; \
740 /* Set designation state into CODING. */
741 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
743 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
744 make_number (chars), \
745 make_number (final_char)); \
748 if (coding->direction == 1 \
749 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
750 charset = CHARSET_REVERSE_CHARSET (charset); \
751 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
755 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
758 decode_coding_iso2022 (coding
, source
, destination
,
759 src_bytes
, dst_bytes
, consumed
)
760 struct coding_system
*coding
;
761 unsigned char *source
, *destination
;
762 int src_bytes
, dst_bytes
;
765 unsigned char *src
= source
;
766 unsigned char *src_end
= source
+ src_bytes
;
767 unsigned char *dst
= destination
;
768 unsigned char *dst_end
= destination
+ dst_bytes
;
769 /* Since the maximum bytes produced by each loop is 7, we subtract 6
770 from DST_END to assure that overflow checking is necessary only
771 at the head of loop. */
772 unsigned char *adjusted_dst_end
= dst_end
- 6;
774 /* Charsets invoked to graphic plane 0 and 1 respectively. */
775 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
776 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
777 Lisp_Object unification_table
778 = coding
->character_unification_table_for_decode
;
780 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
781 unification_table
= Vstandard_character_unification_table_for_decode
;
783 while (src
< src_end
&& dst
< adjusted_dst_end
)
785 /* SRC_BASE remembers the start position in source in each loop.
786 The loop will be exited when there's not enough source text
787 to analyze long escape sequence or 2-byte code (within macros
788 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
789 to SRC_BASE before exiting. */
790 unsigned char *src_base
= src
;
793 switch (iso_code_class
[c1
])
795 case ISO_0x20_or_0x7F
:
796 if (!coding
->composing
797 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
799 /* This is SPACE or DEL. */
803 /* This is a graphic character, we fall down ... */
805 case ISO_graphic_plane_0
:
806 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
808 /* This is a composition rule. */
810 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
813 DECODE_ISO_CHARACTER (charset0
, c1
);
816 case ISO_0xA0_or_0xFF
:
817 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94)
823 /* This is a graphic character, we fall down ... */
825 case ISO_graphic_plane_1
:
826 DECODE_ISO_CHARACTER (charset1
, c1
);
829 case ISO_control_code
:
830 /* All ISO2022 control characters in this class have the
831 same representation in Emacs internal format. */
835 case ISO_carriage_return
:
836 if (coding
->eol_type
== CODING_EOL_CR
)
840 else if (coding
->eol_type
== CODING_EOL_CRLF
)
843 if (c1
== ISO_CODE_LF
)
858 if (CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
859 goto label_invalid_escape_sequence
;
860 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
861 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
865 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
866 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
869 case ISO_single_shift_2_7
:
870 case ISO_single_shift_2
:
871 /* SS2 is handled as an escape sequence of ESC 'N' */
873 goto label_escape_sequence
;
875 case ISO_single_shift_3
:
876 /* SS3 is handled as an escape sequence of ESC 'O' */
878 goto label_escape_sequence
;
880 case ISO_control_sequence_introducer
:
881 /* CSI is handled as an escape sequence of ESC '[' ... */
883 goto label_escape_sequence
;
887 label_escape_sequence
:
888 /* Escape sequences handled by Emacs are invocation,
889 designation, direction specification, and character
890 composition specification. */
893 case '&': /* revision of following character set */
895 if (!(c1
>= '@' && c1
<= '~'))
896 goto label_invalid_escape_sequence
;
898 if (c1
!= ISO_CODE_ESC
)
899 goto label_invalid_escape_sequence
;
901 goto label_escape_sequence
;
903 case '$': /* designation of 2-byte character set */
905 if (c1
>= '@' && c1
<= 'B')
906 { /* designation of JISX0208.1978, GB2312.1980,
908 DECODE_DESIGNATION (0, 2, 94, c1
);
910 else if (c1
>= 0x28 && c1
<= 0x2B)
911 { /* designation of DIMENSION2_CHARS94 character set */
913 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
915 else if (c1
>= 0x2C && c1
<= 0x2F)
916 { /* designation of DIMENSION2_CHARS96 character set */
918 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
921 goto label_invalid_escape_sequence
;
924 case 'n': /* invocation of locking-shift-2 */
925 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
926 goto label_invalid_escape_sequence
;
927 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
928 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
931 case 'o': /* invocation of locking-shift-3 */
932 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
933 goto label_invalid_escape_sequence
;
934 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
935 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
938 case 'N': /* invocation of single-shift-2 */
939 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
940 goto label_invalid_escape_sequence
;
942 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
943 DECODE_ISO_CHARACTER (charset
, c1
);
946 case 'O': /* invocation of single-shift-3 */
947 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
948 goto label_invalid_escape_sequence
;
950 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
951 DECODE_ISO_CHARACTER (charset
, c1
);
954 case '0': /* start composing without embeded rules */
955 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
958 case '1': /* end composing */
959 coding
->composing
= COMPOSING_NO
;
962 case '2': /* start composing with embeded rules */
963 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
966 case '[': /* specification of direction */
967 /* For the moment, nested direction is not supported.
968 So, the value of `coding->direction' is 0 or 1: 0
969 means left-to-right, 1 means right-to-left. */
973 case ']': /* end of the current direction */
974 coding
->direction
= 0;
976 case '0': /* end of the current direction */
977 case '1': /* start of left-to-right direction */
980 coding
->direction
= 0;
982 goto label_invalid_escape_sequence
;
985 case '2': /* start of right-to-left direction */
988 coding
->direction
= 1;
990 goto label_invalid_escape_sequence
;
994 goto label_invalid_escape_sequence
;
999 if (c1
>= 0x28 && c1
<= 0x2B)
1000 { /* designation of DIMENSION1_CHARS94 character set */
1002 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1004 else if (c1
>= 0x2C && c1
<= 0x2F)
1005 { /* designation of DIMENSION1_CHARS96 character set */
1007 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1011 goto label_invalid_escape_sequence
;
1014 /* We must update these variables now. */
1015 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1016 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1019 label_invalid_escape_sequence
:
1021 int length
= src
- src_base
;
1023 bcopy (src_base
, dst
, length
);
1030 coding
->carryover_size
= src
- src_base
;
1031 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1036 /* If this is the last block of the text to be decoded, we had
1037 better just flush out all remaining codes in the text although
1038 they are not valid characters. */
1039 if (coding
->last_block
)
1041 bcopy (src
, dst
, src_end
- src
);
1042 dst
+= (src_end
- src
);
1045 *consumed
= src
- source
;
1046 return dst
- destination
;
1049 /* ISO2022 encoding stuff. */
1052 It is not enough to say just "ISO2022" on encoding, we have to
1053 specify more details. In Emacs, each coding-system of ISO2022
1054 variant has the following specifications:
1055 1. Initial designation to G0 thru G3.
1056 2. Allows short-form designation?
1057 3. ASCII should be designated to G0 before control characters?
1058 4. ASCII should be designated to G0 at end of line?
1059 5. 7-bit environment or 8-bit environment?
1060 6. Use locking-shift?
1061 7. Use Single-shift?
1062 And the following two are only for Japanese:
1063 8. Use ASCII in place of JIS0201-1976-Roman?
1064 9. Use JISX0208-1983 in place of JISX0208-1978?
1065 These specifications are encoded in `coding->flags' as flag bits
1066 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1070 /* Produce codes (escape sequence) for designating CHARSET to graphic
1071 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1072 the coding system CODING allows, produce designation sequence of
1075 #define ENCODE_DESIGNATION(charset, reg, coding) \
1077 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1078 char *intermediate_char_94 = "()*+"; \
1079 char *intermediate_char_96 = ",-./"; \
1081 = Fassq (make_number (charset), Vcharset_revision_alist); \
1082 if (! NILP (temp)) \
1084 *dst++ = ISO_CODE_ESC; \
1086 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1088 *dst++ = ISO_CODE_ESC; \
1089 if (CHARSET_DIMENSION (charset) == 1) \
1091 if (CHARSET_CHARS (charset) == 94) \
1092 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1094 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1099 if (CHARSET_CHARS (charset) == 94) \
1101 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1103 || final_char < '@' || final_char > 'B') \
1104 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1107 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1109 *dst++ = final_char; \
1110 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  */

/* Store the single-shift-2 function at DST and advance DST past it:
   the two bytes ESC 'N' when `coding->flags' has
   CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single byte
   ISO_CODE_SS2.  Afterwards the single-shifting state of CODING is
   set to 1.  `coding' and `dst' are taken from the scope in which the
   macro is expanded.  The body is wrapped in do { } while (0) so the
   expansion is exactly one statement.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'N';                                                   \
      }                                                                 \
    else                                                                \
      *dst++ = ISO_CODE_SS2;                                            \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)
/* Store the single-shift-3 function at DST and advance DST past it:
   the two bytes ESC 'O' when `coding->flags' has
   CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single byte
   ISO_CODE_SS3.  Afterwards the single-shifting state of CODING is
   set to 1.  `coding' and `dst' are taken from the scope in which the
   macro is expanded.  The body is wrapped in do { } while (0) so the
   expansion is exactly one statement.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'O';                                                   \
      }                                                                 \
    else                                                                \
      *dst++ = ISO_CODE_SS3;                                            \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)
1135 /* The following four macros produce codes (control character or
1136 escape sequence) for ISO2022 locking-shift functions (shift-in,
1137 shift-out, locking-shift-2, and locking-shift-3). */
/* Store the shift-in control ISO_CODE_SI at DST, advance DST, and
   record in CODING that graphic plane 0 now invokes graphic register
   0.  `coding' and `dst' come from the expansion site; the
   do { } while (0) wrapper makes the expansion a single statement.  */
#define ENCODE_SHIFT_IN                                                 \
  do {                                                                  \
    *dst++ = ISO_CODE_SI;                                               \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;                         \
  } while (0)
/* Store the shift-out control ISO_CODE_SO at DST, advance DST, and
   record in CODING that graphic plane 0 now invokes graphic register
   1.  `coding' and `dst' come from the expansion site; the
   do { } while (0) wrapper makes the expansion a single statement.  */
#define ENCODE_SHIFT_OUT                                                \
  do {                                                                  \
    *dst++ = ISO_CODE_SO;                                               \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;                         \
  } while (0)
/* Store the locking-shift-2 escape sequence ESC 'n' at DST, advance
   DST by two, and record in CODING that graphic plane 0 now invokes
   graphic register 2.  `coding' and `dst' come from the expansion
   site; the do { } while (0) wrapper makes the expansion a single
   statement.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = 'n';                                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;                         \
  } while (0)
/* Store the locking-shift-3 escape sequence ESC 'o' at DST, advance
   DST by two, and record in CODING that graphic plane 0 now invokes
   graphic register 3.  `coding' and `dst' come from the expansion
   site; the do { } while (0) wrapper makes the expansion a single
   statement.  */
#define ENCODE_LOCKING_SHIFT_3                                          \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = 'o';                                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;                         \
  } while (0)
1163 /* Produce codes for a DIMENSION1 character whose character set is
1164 CHARSET and whose position-code is C1. Designation and invocation
1165 sequences are also produced in advance if necessary. */
1168 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1170 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1172 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1173 *dst++ = c1 & 0x7F; \
1175 *dst++ = c1 | 0x80; \
1176 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1179 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1181 *dst++ = c1 & 0x7F; \
1184 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1186 *dst++ = c1 | 0x80; \
1190 /* Since CHARSET is not yet invoked to any graphic planes, we \
1191 must invoke it, or, at first, designate it to some graphic \
1192 register. Then repeat the loop to actually produce the \
1194 dst = encode_invocation_designation (charset, coding, dst); \
1197 /* Produce codes for a DIMENSION2 character whose character set is
1198 CHARSET and whose position-codes are C1 and C2. Designation and
1199 invocation codes are also produced in advance if necessary. */
1201 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1203 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1205 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1206 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1208 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1209 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1212 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1214 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1217 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1219 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1223 /* Since CHARSET is not yet invoked to any graphic planes, we \
1224 must invoke it, or, at first, designate it to some graphic \
1225 register. Then repeat the loop to actually produce the \
1227 dst = encode_invocation_designation (charset, coding, dst); \
1230 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1232 int c_alt, charset_alt; \
1233 if (!NILP (unification_table) \
1234 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1236 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1238 charset_alt = charset; \
1239 if (CHARSET_DIMENSION (charset_alt) == 1) \
1240 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1242 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1245 /* Produce designation and invocation codes at a place pointed by DST
1246 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1250 encode_invocation_designation (charset
, coding
, dst
)
1252 struct coding_system
*coding
;
1255 int reg
; /* graphic register number */
1257 /* At first, check designations. */
1258 for (reg
= 0; reg
< 4; reg
++)
1259 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1264 /* CHARSET is not yet designated to any graphic registers. */
1265 /* At first check the requested designation. */
1266 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1267 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1268 /* Since CHARSET requests no special designation, designate it
1269 to graphic register 0. */
1272 ENCODE_DESIGNATION (charset
, reg
, coding
);
1275 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1276 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1278 /* Since the graphic register REG is not invoked to any graphic
1279 planes, invoke it to graphic plane 0. */
1282 case 0: /* graphic register 0 */
1286 case 1: /* graphic register 1 */
1290 case 2: /* graphic register 2 */
1291 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1292 ENCODE_SINGLE_SHIFT_2
;
1294 ENCODE_LOCKING_SHIFT_2
;
1297 case 3: /* graphic register 3 */
1298 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1299 ENCODE_SINGLE_SHIFT_3
;
1301 ENCODE_LOCKING_SHIFT_3
;
/* The following three macros produce codes for indicating composition.
   Each stores a two-byte escape sequence at DST (taken from the
   expansion site) and advances DST by two: ESC '0' starts a
   composition without rules, ESC '2' starts a composition with
   embedded rules, and ESC '1' ends the current composition.  The
   expansions are parenthesized so that each macro is a single
   expression wherever it is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
/* The following three macros produce codes for indicating direction
   of text.  */

/* Store a control sequence introducer at DST and advance DST past it:
   the two bytes ESC '[' when `coding->flags' has
   CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single byte
   ISO_CODE_CSI.  `coding' and `dst' are taken from the expansion
   site.

   The seven-bit test must be a bit test (`&'), like every other use of
   CODING_FLAG_ISO_SEVEN_BITS in this file: `coding->flags' carries
   several flag bits at once, so comparing it for equality with one
   flag would select the 8-bit CSI for almost every seven-bit coding
   system.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '[';                                                   \
      }                                                                 \
    else                                                                \
      *dst++ = ISO_CODE_CSI;                                            \
  } while (0)
/* Store at DST the sequence announcing right-to-left text: a control
   sequence introducer followed by the bytes '2' and ']'.  DST is
   advanced past everything stored.  Written as a do { } while (0)
   statement so that it expands correctly whether
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is itself a statement or an
   expression.  */
#define ENCODE_DIRECTION_R2L                                            \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    *dst++ = '2';                                                       \
    *dst++ = ']';                                                       \
  } while (0)

/* Store at DST a control sequence introducer followed by the bytes
   '0' and ']', and advance DST past them.  The ISO2022 decoder in
   this file reads that sequence as the end of the current direction
   and sets `coding->direction' back to 0 (left-to-right).  */
#define ENCODE_DIRECTION_L2R                                            \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    *dst++ = '0';                                                       \
    *dst++ = ']';                                                       \
  } while (0)
1329 /* Produce codes for designation and invocation to reset the graphic
1330 planes and registers to initial state. */
1331 #define ENCODE_RESET_PLANE_AND_REGISTER \
1334 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1336 for (reg = 0; reg < 4; reg++) \
1337 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1338 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1339 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1340 ENCODE_DESIGNATION \
1341 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1344 /* Produce designation sequences of charsets in the line started from
1345 *SRC to a place pointed by DSTP.
1347 If the current block ends before any end-of-line, we may fail to
1348 find all the necessary *designations. */
1349 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1350 struct coding_system
*coding
;
1352 unsigned char *src
, *src_end
, **dstp
;
1354 int charset
, c
, found
= 0, reg
;
1355 /* Table of charsets to be designated to each graphic register. */
1357 unsigned char *dst
= *dstp
;
1359 for (reg
= 0; reg
< 4; reg
++)
1362 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1364 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1367 charset
= CHARSET_AT (src
);
1372 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1373 if ((c_alt
= unify_char (table
, -1, charset
, c1
, c2
)) >= 0)
1374 charset
= CHAR_CHARSET (c_alt
);
1377 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1378 if (r
[reg
] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1389 for (reg
= 0; reg
< 4; reg
++)
1391 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1392 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1397 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1400 encode_coding_iso2022 (coding
, source
, destination
,
1401 src_bytes
, dst_bytes
, consumed
)
1402 struct coding_system
*coding
;
1403 unsigned char *source
, *destination
;
1404 int src_bytes
, dst_bytes
;
1407 unsigned char *src
= source
;
1408 unsigned char *src_end
= source
+ src_bytes
;
1409 unsigned char *dst
= destination
;
1410 unsigned char *dst_end
= destination
+ dst_bytes
;
1411 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1412 from DST_END to assure overflow checking is necessary only at the
1414 unsigned char *adjusted_dst_end
= dst_end
- 19;
1415 Lisp_Object unification_table
1416 = coding
->character_unification_table_for_encode
;
1418 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1419 unification_table
= Vstandard_character_unification_table_for_encode
;
1421 while (src
< src_end
&& dst
< adjusted_dst_end
)
1423 /* SRC_BASE remembers the start position in source in each loop.
1424 The loop will be exited when there's not enough source text
1425 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1426 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1427 reset to SRC_BASE before exiting. */
1428 unsigned char *src_base
= src
;
1429 int charset
, c1
, c2
, c3
, c4
;
1431 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1432 && CODING_SPEC_ISO_BOL (coding
))
1434 /* We have to produce designation sequences if any now. */
1435 encode_designation_at_bol (coding
, unification_table
,
1436 src
, src_end
, &dst
);
1437 CODING_SPEC_ISO_BOL (coding
) = 0;
1441 /* If we are seeing a component of a composite character, we are
1442 seeing a leading-code specially encoded for composition, or a
1443 composition rule if composing with rule. We must set C1
1444 to a normal leading-code or an ASCII code. If we are not at
1445 a composed character, we must reset the composition state. */
1446 if (COMPOSING_P (coding
->composing
))
1450 /* We are not in a composite character any longer. */
1451 coding
->composing
= COMPOSING_NO
;
1452 ENCODE_COMPOSITION_END
;
1456 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1459 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1462 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1463 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1466 /* This is an ASCII component. */
1471 /* This is a leading-code of non ASCII component. */
1476 /* Now encode one character. C1 is a control character, an
1477 ASCII character, or a leading-code of multi-byte character. */
1478 switch (emacs_code_class
[c1
])
1480 case EMACS_ascii_code
:
1481 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1484 case EMACS_control_code
:
1485 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1486 ENCODE_RESET_PLANE_AND_REGISTER
;
1490 case EMACS_carriage_return_code
:
1491 if (!coding
->selective
)
1493 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1494 ENCODE_RESET_PLANE_AND_REGISTER
;
1498 /* fall down to treat '\r' as '\n' ... */
1500 case EMACS_linefeed_code
:
1501 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1502 ENCODE_RESET_PLANE_AND_REGISTER
;
1503 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1504 bcopy (coding
->spec
.iso2022
.initial_designation
,
1505 coding
->spec
.iso2022
.current_designation
,
1506 sizeof coding
->spec
.iso2022
.initial_designation
);
1507 if (coding
->eol_type
== CODING_EOL_LF
1508 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1509 *dst
++ = ISO_CODE_LF
;
1510 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1511 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1513 *dst
++ = ISO_CODE_CR
;
1514 CODING_SPEC_ISO_BOL (coding
) = 1;
1517 case EMACS_leading_code_2
:
1521 /* invalid sequence */
1526 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1529 case EMACS_leading_code_3
:
1530 TWO_MORE_BYTES (c2
, c3
);
1531 if (c2
< 0xA0 || c3
< 0xA0)
1533 /* invalid sequence */
1538 else if (c1
< LEADING_CODE_PRIVATE_11
)
1539 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1541 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1544 case EMACS_leading_code_4
:
1545 THREE_MORE_BYTES (c2
, c3
, c4
);
1546 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1548 /* invalid sequence */
1555 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1558 case EMACS_leading_code_composition
:
1562 /* invalid sequence */
1566 else if (c2
== 0xFF)
1568 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1569 ENCODE_COMPOSITION_WITH_RULE_START
;
1573 /* Rewind one byte because it is a character code of
1574 composition elements. */
1576 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1577 ENCODE_COMPOSITION_NO_RULE_START
;
1581 case EMACS_invalid_code
:
1587 /* We reach here because the source data ends not at character
1589 coding
->carryover_size
= src_end
- src_base
;
1590 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1595 /* If this is the last block of the text to be encoded, we must
1596 reset graphic planes and registers to the initial state. */
1597 if (src
>= src_end
&& coding
->last_block
)
1599 ENCODE_RESET_PLANE_AND_REGISTER
;
1600 if (coding
->carryover_size
> 0
1601 && coding
->carryover_size
< (dst_end
- dst
))
1603 bcopy (coding
->carryover
, dst
, coding
->carryover_size
);
1604 dst
+= coding
->carryover_size
;
1605 coding
->carryover_size
= 0;
1608 *consumed
= src
- source
;
1609 return dst
- destination
;
1613 /*** 4. SJIS and BIG5 handlers ***/
1615 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1616 quite widely. So, for the moment, Emacs supports them in the bare
1617 C code. But, in the future, they may be supported only by CCL. */
1619 /* SJIS is a coding system encoding three character sets: ASCII, right
1620 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1621 as is. A character of charset katakana-jisx0201 is encoded by
1622 "position-code + 0x80". A character of charset japanese-jisx0208
1623 is encoded in 2-byte but two position-codes are divided and shifted
1624 so that they fit in the range below.
1626 --- CODE RANGE of SJIS ---
1627 (character set) (range)
1629 KATAKANA-JISX0201 0xA0 .. 0xDF
1630 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1631 (2nd byte) 0x40 .. 0xFF
1632 -------------------------------
1636 /* BIG5 is a coding system encoding two character sets: ASCII and
1637 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1638 character set and is encoded in two-byte.
1640 --- CODE RANGE of BIG5 ---
1641 (character set) (range)
1643 Big5 (1st byte) 0xA1 .. 0xFE
1644 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1645 --------------------------
1647 Since the number of characters in Big5 is larger than maximum
1648 characters in Emacs' charset (96x96), it can't be handled as one
1649 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1650 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1651 contains frequently used characters and the latter contains less
1652 frequently used characters. */
1654 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1655 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1656 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1657 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2 into CHARSET and the position-codes
   C1 and C2, which are assigned through the macro arguments.

   The pair is first flattened to a running index: (B1 - 0xA1) rows of
   BIG5_SAME_ROW codes plus the offset of B2 within its row (B2 - 0x40
   below 0x7F, B2 - 0x62 otherwise).  Lead bytes below 0xC9 yield
   `charset_big5_1'; the others yield `charset_big5_2', with the index
   rebased to the start of that charset.  The index is then split into
   two position-codes of (0xFF - 0xA1) values each, starting at 0x21.

   NOTE(review): the `(b1) < 0xC9' split is reconstructed as the
   inverse of the (0xC9 - 0xA1) rebase done by ENCODE_BIG5 -- confirm
   against the upstream file.

   Every argument is parenthesized so expressions may be passed, but
   B1 and B2 are evaluated more than once and must be free of side
   effects.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)
/* Encode a character of CHARSET (`charset_big5_1' or
   `charset_big5_2') whose position-codes are C1 and C2 into the BIG5
   byte pair B1, B2, which are assigned through the macro arguments.

   C1 and C2 (each counted from 0x21, with 0xFF - 0xA1 values per
   position) are flattened to a running index; for `charset_big5_2'
   the index is moved past the (0xC9 - 0xA1) rows of BIG5_SAME_ROW
   codes that precede it.  B1 is the row number plus 0xA1.  B2 is the
   offset within the row, mapped to 0x40.. for offsets below 0x3F and
   to 0xA1.. (offset + 0x62) otherwise.

   Every argument is parenthesized so that expressions such as
   `x & 0x7F' can be passed safely.  B2 is evaluated more than once
   and must be a plain lvalue.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
/* Produce the decoded form of one character of CHARSET whose
   position-codes are C1 and C2.

   When `unification_table' (taken from the expansion site) is non-nil
   and unify_char returns a non-negative character for the input, that
   character is split back into a charset and position-codes, which
   replace CHARSET, C1 and C2 for the rest of the macro; C1 and C2 are
   therefore written to and must be lvalues.  The result is then
   handed to exactly one of DECODE_CHARACTER_ASCII (charset is
   CHARSET_ASCII or negative), DECODE_CHARACTER_DIMENSION1, or
   DECODE_CHARACTER_DIMENSION2, chosen by the charset's dimension.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (!NILP (unification_table)                                       \
        && ((c_alt = unify_char (unification_table,                     \
                                 -1, (charset), c1, c2)) >= 0))         \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
  } while (0)
1702 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1704 int c_alt, charset_alt; \
1705 if (!NILP (unification_table) \
1706 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1708 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1710 charset_alt = charset; \
1711 if (charset_alt == charset_ascii) \
1713 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1715 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1718 *dst++ = charset_alt, *dst++ = c1; \
1722 c1 &= 0x7F, c2 &= 0x7F; \
1723 if (sjis_p && charset_alt == charset_jisx0208) \
1725 unsigned char s1, s2; \
1727 ENCODE_SJIS (c1, c2, s1, s2); \
1728 *dst++ = s1, *dst++ = s2; \
1731 && (charset_alt == charset_big5_1 \
1732 || charset_alt == charset_big5_2)) \
1734 unsigned char b1, b2; \
1736 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
1737 *dst++ = b1, *dst++ = b2; \
1740 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1744 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1745 Check if a text is encoded in SJIS. If it is, return
1746 CODING_CATEGORY_MASK_SJIS, else return 0. */
1749 detect_coding_sjis (src
, src_end
)
1750 unsigned char *src
, *src_end
;
1754 while (src
< src_end
)
1757 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1759 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
1761 if (src
< src_end
&& *src
++ < 0x40)
1765 return CODING_CATEGORY_MASK_SJIS
;
1768 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1769 Check if a text is encoded in BIG5. If it is, return
1770 CODING_CATEGORY_MASK_BIG5, else return 0. */
1773 detect_coding_big5 (src
, src_end
)
1774 unsigned char *src
, *src_end
;
1778 while (src
< src_end
)
1781 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1788 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
1792 return CODING_CATEGORY_MASK_BIG5
;
1795 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1796 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
1799 decode_coding_sjis_big5 (coding
, source
, destination
,
1800 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1801 struct coding_system
*coding
;
1802 unsigned char *source
, *destination
;
1803 int src_bytes
, dst_bytes
;
1807 unsigned char *src
= source
;
1808 unsigned char *src_end
= source
+ src_bytes
;
1809 unsigned char *dst
= destination
;
1810 unsigned char *dst_end
= destination
+ dst_bytes
;
1811 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1812 from DST_END to assure overflow checking is necessary only at the
1814 unsigned char *adjusted_dst_end
= dst_end
- 3;
1815 Lisp_Object unification_table
1816 = coding
->character_unification_table_for_decode
;
1818 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1819 unification_table
= Vstandard_character_unification_table_for_decode
;
1821 while (src
< src_end
&& dst
< adjusted_dst_end
)
1823 /* SRC_BASE remembers the start position in source in each loop.
1824 The loop will be exited when there's not enough source text
1825 to analyze two-byte character (within macro ONE_MORE_BYTE).
1826 In that case, SRC is reset to SRC_BASE before exiting. */
1827 unsigned char *src_base
= src
;
1828 unsigned char c1
= *src
++, c2
, c3
, c4
;
1832 if (coding
->eol_type
== CODING_EOL_CRLF
)
1838 /* To process C2 again, SRC is subtracted by 1. */
1847 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
1848 else if (c1
< 0xA0 || c1
>= 0xE0)
1850 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1854 DECODE_SJIS (c1
, c2
, c3
, c4
);
1855 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
1857 else if (c1
>= 0xE0 && c1
< 0xFF)
1862 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1863 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
1865 else /* Invalid code */
1870 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1872 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
, /* dummy */ c2
);
1878 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1879 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
1885 coding
->carryover_size
= src
- src_base
;
1886 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1891 *consumed
= src
- source
;
1892 return dst
- destination
;
1895 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1896 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1897 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1898 sure that all these charsets are registered as official charset
1899 (i.e. do not have extended leading-codes). Characters of other
1900 charsets are produced without any encoding. If SJIS_P is 1, encode
1901 SJIS text, else encode BIG5 text. */
1904 encode_coding_sjis_big5 (coding
, source
, destination
,
1905 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1906 struct coding_system
*coding
;
1907 unsigned char *source
, *destination
;
1908 int src_bytes
, dst_bytes
;
1912 unsigned char *src
= source
;
1913 unsigned char *src_end
= source
+ src_bytes
;
1914 unsigned char *dst
= destination
;
1915 unsigned char *dst_end
= destination
+ dst_bytes
;
1916 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1917 from DST_END to assure overflow checking is necessary only at the
1919 unsigned char *adjusted_dst_end
= dst_end
- 1;
1920 Lisp_Object unification_table
1921 = coding
->character_unification_table_for_encode
;
1923 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1924 unification_table
= Vstandard_character_unification_table_for_encode
;
1926 while (src
< src_end
&& dst
< adjusted_dst_end
)
1928 /* SRC_BASE remembers the start position in source in each loop.
1929 The loop will be exited when there's not enough source text
1930 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1931 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1933 unsigned char *src_base
= src
;
1934 unsigned char c1
= *src
++, c2
, c3
, c4
;
1936 if (coding
->composing
)
1943 else if (c1
>= 0xA0)
1946 coding
->composing
= 0;
1949 switch (emacs_code_class
[c1
])
1951 case EMACS_ascii_code
:
1952 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
1955 case EMACS_control_code
:
1959 case EMACS_carriage_return_code
:
1960 if (!coding
->selective
)
1965 /* fall down to treat '\r' as '\n' ... */
1967 case EMACS_linefeed_code
:
1968 if (coding
->eol_type
== CODING_EOL_LF
1969 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1971 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1972 *dst
++ = '\r', *dst
++ = '\n';
1977 case EMACS_leading_code_2
:
1979 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
1982 case EMACS_leading_code_3
:
1983 TWO_MORE_BYTES (c2
, c3
);
1984 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
1987 case EMACS_leading_code_4
:
1988 THREE_MORE_BYTES (c2
, c3
, c4
);
1989 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
1992 case EMACS_leading_code_composition
:
1993 coding
->composing
= 1;
1996 default: /* i.e. case EMACS_invalid_code: */
2002 coding
->carryover_size
= src_end
- src_base
;
2003 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
2008 *consumed
= src
- source
;
2009 return dst
- destination
;
2013 /*** 5. End-of-line handlers ***/
2015 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2016 This function is called only when `coding->eol_type' is
2017 CODING_EOL_CRLF or CODING_EOL_CR. */
2019 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2020 struct coding_system
*coding
;
2021 unsigned char *source
, *destination
;
2022 int src_bytes
, dst_bytes
;
2025 unsigned char *src
= source
;
2026 unsigned char *src_end
= source
+ src_bytes
;
2027 unsigned char *dst
= destination
;
2028 unsigned char *dst_end
= destination
+ dst_bytes
;
2031 switch (coding
->eol_type
)
2033 case CODING_EOL_CRLF
:
2035 /* Since the maximum bytes produced by each loop is 2, we
2036 subtract 1 from DST_END to assure overflow checking is
2037 necessary only at the head of loop. */
2038 unsigned char *adjusted_dst_end
= dst_end
- 1;
2040 while (src
< src_end
&& dst
< adjusted_dst_end
)
2042 unsigned char *src_base
= src
;
2043 unsigned char c
= *src
++;
2056 coding
->carryover_size
= src
- src_base
;
2057 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
2061 *consumed
= src
- source
;
2062 produced
= dst
- destination
;
2067 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2068 bcopy (source
, destination
, produced
);
2069 dst_end
= destination
+ produced
;
2070 while (dst
< dst_end
)
2071 if (*dst
++ == '\r') dst
[-1] = '\n';
2072 *consumed
= produced
;
2075 default: /* i.e. case: CODING_EOL_LF */
2076 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2077 bcopy (source
, destination
, produced
);
2078 *consumed
= produced
;
2085 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2086 format of end-of-line according to `coding->eol_type'. If
2087 `coding->selective' is 1, code '\r' in source text also means
2090 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2091 struct coding_system
*coding
;
2092 unsigned char *source
, *destination
;
2093 int src_bytes
, dst_bytes
;
2096 unsigned char *src
= source
;
2097 unsigned char *dst
= destination
;
2103 switch (coding
->eol_type
)
2106 case CODING_EOL_UNDECIDED
:
2107 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2108 bcopy (source
, destination
, produced
);
2109 if (coding
->selective
)
2113 if (*dst
++ == '\r') dst
[-1] = '\n';
2115 *consumed
= produced
;
2117 case CODING_EOL_CRLF
:
2120 unsigned char *src_end
= source
+ src_bytes
;
2121 unsigned char *dst_end
= destination
+ dst_bytes
;
2122 /* Since the maximum bytes produced by each loop is 2, we
2123 subtract 1 from DST_END to assure overflow checking is
2124 necessary only at the head of loop. */
2125 unsigned char *adjusted_dst_end
= dst_end
- 1;
2127 while (src
< src_end
&& dst
< adjusted_dst_end
)
2130 if (c
== '\n' || (c
== '\r' && coding
->selective
))
2131 *dst
++ = '\r', *dst
++ = '\n';
2135 produced
= dst
- destination
;
2136 *consumed
= src
- source
;
2140 default: /* i.e. case CODING_EOL_CR: */
2141 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2142 bcopy (source
, destination
, produced
);
2146 if (*dst
++ == '\n') dst
[-1] = '\r';
2148 *consumed
= produced
;
2155 /*** 6. C library functions ***/
2157 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2158 has a property `coding-system'. The value of this property is a
2159 vector of length 5 (called as coding-vector). Among elements of
2160 this vector, the first (element[0]) and the fifth (element[4])
2161 carry important information for decoding/encoding. Before
2162 decoding/encoding, this information should be set in fields of a
2163 structure of type `coding_system'.
2165 A value of property `coding-system' can be a symbol of another
2166 subsidiary coding-system. In that case, Emacs gets coding-vector
2169 `element[0]' contains information to be set in `coding->type'. The
2170 value and its meaning is as follows:
2172 0 -- coding_type_emacs_mule
2173 1 -- coding_type_sjis
2174 2 -- coding_type_iso2022
2175 3 -- coding_type_big5
2176 4 -- coding_type_ccl encoder/decoder written in CCL
2177 nil -- coding_type_no_conversion
2178 t -- coding_type_undecided (automatic conversion on decoding,
2179 no-conversion on encoding)
2181 `element[4]' contains information to be set in `coding->flags' and
2182 `coding->spec'. The meaning varies by `coding->type'.
2184 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2185 of length 32 (of which the first 13 sub-elements are used now).
2186 Meanings of these sub-elements are:
2188 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2189 If the value is an integer of valid charset, the charset is
2190 assumed to be designated to graphic register N initially.
2192 If the value is minus, it is a minus value of charset which
2193 reserves graphic register N, which means that the charset is
2194 not designated initially but should be designated to graphic
2195 register N just before encoding a character in that charset.
2197 If the value is nil, graphic register N is never used on
2200 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2201 Each value takes t or nil. See the section ISO2022 of
2202 `coding.h' for more information.
2204 If `coding->type' is `coding_type_big5', element[4] is t to denote
2205 BIG5-ETen or nil to denote BIG5-HKU.
2207 If `coding->type' takes the other value, element[4] is ignored.
2209 Emacs Lisp's coding system also carries information about format of
2210 end-of-line in a value of property `eol-type'. If the value is
2211 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2212 means CODING_EOL_CR. If it is not integer, it should be a vector
2213 of subsidiary coding systems of which property `eol-type' has one
2218 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2219 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2220 is setup so that no conversion is necessary and return -1, else
2224 setup_coding_system (coding_system
, coding
)
2225 Lisp_Object coding_system
;
2226 struct coding_system
*coding
;
2228 Lisp_Object type
, eol_type
;
2230 /* At first, set several fields to default values. */
2231 coding
->require_flushing
= 0;
2232 coding
->last_block
= 0;
2233 coding
->selective
= 0;
2234 coding
->composing
= 0;
2235 coding
->direction
= 0;
2236 coding
->carryover_size
= 0;
2237 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2238 coding
->character_unification_table_for_decode
= Qnil
;
2239 coding
->character_unification_table_for_encode
= Qnil
;
2241 Vlast_coding_system_used
= coding
->symbol
= coding_system
;
2243 /* Get value of property `coding-system' until we get a vector.
2244 While doing that, also get values of properties
2245 `post-read-conversion', `pre-write-conversion',
2246 `character-unification-table-for-decode',
2247 `character-unification-table-for-encode' and `eol-type'. */
2248 while (!NILP (coding_system
) && SYMBOLP (coding_system
))
2250 if (NILP (coding
->post_read_conversion
))
2251 coding
->post_read_conversion
= Fget (coding_system
,
2252 Qpost_read_conversion
);
2253 if (NILP (coding
->pre_write_conversion
))
2254 coding
->pre_write_conversion
= Fget (coding_system
,
2255 Qpre_write_conversion
);
2256 if (!inhibit_eol_conversion
&& NILP (eol_type
))
2257 eol_type
= Fget (coding_system
, Qeol_type
);
2259 if (NILP (coding
->character_unification_table_for_decode
))
2260 coding
->character_unification_table_for_decode
2261 = Fget (coding_system
, Qcharacter_unification_table_for_decode
);
2263 if (NILP (coding
->character_unification_table_for_encode
))
2264 coding
->character_unification_table_for_encode
2265 = Fget (coding_system
, Qcharacter_unification_table_for_encode
);
2267 coding_system
= Fget (coding_system
, Qcoding_system
);
2270 while (!NILP (coding
->character_unification_table_for_decode
)
2271 && SYMBOLP (coding
->character_unification_table_for_decode
))
2272 coding
->character_unification_table_for_decode
2273 = Fget (coding
->character_unification_table_for_decode
,
2274 Qcharacter_unification_table_for_decode
);
2275 if (!NILP (coding
->character_unification_table_for_decode
)
2276 && !CHAR_TABLE_P (coding
->character_unification_table_for_decode
))
2277 coding
->character_unification_table_for_decode
= Qnil
;
2279 while (!NILP (coding
->character_unification_table_for_encode
)
2280 && SYMBOLP (coding
->character_unification_table_for_encode
))
2281 coding
->character_unification_table_for_encode
2282 = Fget (coding
->character_unification_table_for_encode
,
2283 Qcharacter_unification_table_for_encode
);
2284 if (!NILP (coding
->character_unification_table_for_encode
)
2285 && !CHAR_TABLE_P (coding
->character_unification_table_for_encode
))
2286 coding
->character_unification_table_for_encode
= Qnil
;
2288 if (!VECTORP (coding_system
)
2289 || XVECTOR (coding_system
)->size
!= 5)
2290 goto label_invalid_coding_system
;
2292 if (VECTORP (eol_type
))
2293 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2294 else if (XFASTINT (eol_type
) == 1)
2295 coding
->eol_type
= CODING_EOL_CRLF
;
2296 else if (XFASTINT (eol_type
) == 2)
2297 coding
->eol_type
= CODING_EOL_CR
;
2299 coding
->eol_type
= CODING_EOL_LF
;
2301 type
= XVECTOR (coding_system
)->contents
[0];
2302 switch (XFASTINT (type
))
2305 coding
->type
= coding_type_emacs_mule
;
2309 coding
->type
= coding_type_sjis
;
2313 coding
->type
= coding_type_iso2022
;
2315 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2317 int i
, charset
, default_reg_bits
= 0;
2319 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2320 goto label_invalid_coding_system
;
2322 flags
= XVECTOR (val
)->contents
;
2324 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2325 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2326 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2327 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2328 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2329 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2330 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2331 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2332 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2333 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2334 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
));
2336 /* Invoke graphic register 0 to plane 0. */
2337 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2338 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2339 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2340 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2341 /* Not single shifting at first. */
2342 CODING_SPEC_ISO_SINGLE_SHIFTING(coding
) = 0;
2343 /* Beginning of buffer should also be regarded as bol. */
2344 CODING_SPEC_ISO_BOL(coding
) = 1;
2346 /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
2347 FLAGS[REG] can be one of below:
2348 integer CHARSET: CHARSET occupies register REG,
2349 t: designate nothing to REG initially, but can be used
2351 list of integer, nil, or t: designate the first
2352 element (if integer) to REG initially, the remaining
2353 elements (if integer) is designated to REG on request,
2354 if an element is t, REG can be used by any charset,
2355 nil: REG is never used. */
2356 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2357 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2358 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
2359 for (i
= 0; i
< 4; i
++)
2361 if (INTEGERP (flags
[i
])
2362 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
2363 || (charset
= get_charset_id (flags
[i
])) >= 0)
2365 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2366 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
2368 else if (EQ (flags
[i
], Qt
))
2370 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2371 default_reg_bits
|= 1 << i
;
2373 else if (CONSP (flags
[i
]))
2375 Lisp_Object tail
= flags
[i
];
2377 if (INTEGERP (XCONS (tail
)->car
)
2378 && (charset
= XINT (XCONS (tail
)->car
),
2379 CHARSET_VALID_P (charset
))
2380 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2382 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2383 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
2386 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2387 tail
= XCONS (tail
)->cdr
;
2388 while (CONSP (tail
))
2390 if (INTEGERP (XCONS (tail
)->car
)
2391 && (charset
= XINT (XCONS (tail
)->car
),
2392 CHARSET_VALID_P (charset
))
2393 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2394 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2396 else if (EQ (XCONS (tail
)->car
, Qt
))
2397 default_reg_bits
|= 1 << i
;
2398 tail
= XCONS (tail
)->cdr
;
2402 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2404 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
2405 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
2408 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
2410 /* REG 1 can be used only by locking shift in 7-bit env. */
2411 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2412 default_reg_bits
&= ~2;
2413 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
2414 /* Without any shifting, only REG 0 and 1 can be used. */
2415 default_reg_bits
&= 3;
2418 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2419 if (CHARSET_VALID_P (charset
)
2420 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2421 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
2423 /* We have not yet decided where to designate CHARSET. */
2424 int reg_bits
= default_reg_bits
;
2426 if (CHARSET_CHARS (charset
) == 96)
2427 /* A charset of CHARS96 can't be designated to REG 0. */
2431 /* There exist some default graphic register. */
2432 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2434 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
2436 /* We anyway have to designate CHARSET to somewhere. */
2437 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2438 = (CHARSET_CHARS (charset
) == 94
2440 : ((coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
2441 || ! coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2443 : (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
2447 coding
->require_flushing
= 1;
2451 coding
->type
= coding_type_big5
;
2453 = (NILP (XVECTOR (coding_system
)->contents
[4])
2454 ? CODING_FLAG_BIG5_HKU
2455 : CODING_FLAG_BIG5_ETEN
);
2459 coding
->type
= coding_type_ccl
;
2461 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2463 && VECTORP (XCONS (val
)->car
)
2464 && VECTORP (XCONS (val
)->cdr
))
2466 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), XCONS (val
)->car
);
2467 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), XCONS (val
)->cdr
);
2470 goto label_invalid_coding_system
;
2472 coding
->require_flushing
= 1;
2477 coding
->type
= coding_type_undecided
;
2479 coding
->type
= coding_type_no_conversion
;
2484 label_invalid_coding_system
:
2485 coding
->type
= coding_type_no_conversion
;
2486 coding
->eol_type
= CODING_EOL_LF
;
2487 coding
->symbol
= coding
->pre_write_conversion
= coding
->post_read_conversion
2492 /* Emacs has a mechanism to automatically detect a coding system if it
2493 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2494 it's impossible to distinguish some coding systems accurately
2495 because they use the same range of codes. So, at first, coding
2496 systems are categorized into 9 categories, which are:
2498 o coding-category-emacs-mule
2500 The category for a coding system which has the same code range
2501 as Emacs' internal format. Assigned the coding-system (Lisp
2502 symbol) `emacs-mule' by default.
2504 o coding-category-sjis
2506 The category for a coding system which has the same code range
2507 as SJIS. Assigned the coding-system (Lisp
2508 symbol) `japanese-shift-jis' by default.
2510 o coding-category-iso-7
2512 The category for a coding system which has the same code range
2513 as ISO2022 of 7-bit environment. This doesn't use any locking
2514 shift and single shift functions. Assigned the coding-system
2515 (Lisp symbol) `iso-2022-7bit' by default.
2517 o coding-category-iso-8-1
2519 The category for a coding system which has the same code range
2520 as ISO2022 of 8-bit environment and graphic plane 1 used only
2521 for DIMENSION1 charset. This doesn't use any locking shift
2522 and single shift functions. Assigned the coding-system (Lisp
2523 symbol) `iso-latin-1' by default.
2525 o coding-category-iso-8-2
2527 The category for a coding system which has the same code range
2528 as ISO2022 of 8-bit environment and graphic plane 1 used only
2529 for DIMENSION2 charset. This doesn't use any locking shift
2530 and single shift functions. Assigned the coding-system (Lisp
2531 symbol) `japanese-iso-8bit' by default.
2533 o coding-category-iso-7-else
2535 The category for a coding system which has the same code range
2536 as ISO2022 of 7-bit environment but uses locking shift or
2537 single shift functions. Assigned the coding-system (Lisp
2538 symbol) `iso-2022-7bit-lock' by default.
2540 o coding-category-iso-8-else
2542 The category for a coding system which has the same code range
2543 as ISO2022 of 8-bit environment but uses locking shift or
2544 single shift functions. Assigned the coding-system (Lisp
2545 symbol) `iso-2022-8bit-ss2' by default.
2547 o coding-category-big5
2549 The category for a coding system which has the same code range
2550 as BIG5. Assigned the coding-system (Lisp symbol)
2551 `cn-big5' by default.
2553 o coding-category-binary
2555 The category for a coding system not categorized in any of the
2556 above. Assigned the coding-system (Lisp symbol)
2557 `no-conversion' by default.
2559 Each of them is a Lisp symbol whose value is the actual
2560 `coding-system' (also a Lisp symbol) that a user has assigned to it.
2561 What Emacs does actually is to detect a category of coding system.
2562 Then, it uses a `coding-system' assigned to it. If Emacs can't
2563 decide only one possible category, it selects a category of the
2564 highest priority. Priorities of categories are also specified by a
2565 user in a Lisp variable `coding-category-list'.
2569 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2570 If it detects possible coding systems, return an integer in which
2571 appropriate flag bits are set. Flag bits are defined by macros
2572 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2575 detect_coding_mask (src
, src_bytes
)
2579 register unsigned char c
;
2580 unsigned char *src_end
= src
+ src_bytes
;
2583 /* At first, skip all ASCII characters and control characters except
2584 for three ISO2022 specific control characters. */
2585 label_loop_detect_coding
:
2586 while (src
< src_end
)
2590 || (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
2596 /* We found nothing other than ASCII. There's nothing to do. */
2597 return CODING_CATEGORY_MASK_ANY
;
2599 /* The text seems to be encoded in some multilingual coding system.
2600 Now, try to find in which coding system the text is encoded. */
2603 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2604 /* C is an ISO2022 specific control code of C0. */
2605 mask
= detect_coding_iso2022 (src
, src_end
);
2607 if (mask
== CODING_CATEGORY_MASK_ANY
)
2608 /* No valid ISO2022 code follows C. Try again. */
2609 goto label_loop_detect_coding
;
2611 else if (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
2612 /* C is an ISO2022 specific control code of C1,
2613 or the first byte of SJIS's 2-byte character code,
2614 or a leading code of Emacs. */
2615 mask
= (detect_coding_iso2022 (src
, src_end
)
2616 | detect_coding_sjis (src
, src_end
)
2617 | detect_coding_emacs_mule (src
, src_end
)
2618 | CODING_CATEGORY_MASK_BINARY
);
2620 else if (c
== ISO_CODE_CSI
2623 || (src
+ 1 < src_end
2625 && (*src
== '0' || *src
== '1' || *src
== '2')))))
2626 /* C is an ISO2022's control-sequence-introducer. */
2627 mask
= (detect_coding_iso2022 (src
, src_end
)
2628 | detect_coding_sjis (src
, src_end
)
2629 | detect_coding_emacs_mule (src
, src_end
)
2630 | CODING_CATEGORY_MASK_BINARY
);
2633 /* C is the first byte of SJIS character code,
2634 or a leading-code of Emacs. */
2635 mask
= (detect_coding_sjis (src
, src_end
)
2636 | detect_coding_emacs_mule (src
, src_end
)
2637 | CODING_CATEGORY_MASK_BINARY
);
2640 /* C is a character of ISO2022 in graphic plane right,
2641 or a SJIS's 1-byte character code (i.e. JISX0201),
2642 or the first byte of BIG5's 2-byte code. */
2643 mask
= (detect_coding_iso2022 (src
, src_end
)
2644 | detect_coding_sjis (src
, src_end
)
2645 | detect_coding_big5 (src
, src_end
)
2646 | CODING_CATEGORY_MASK_BINARY
);
2651 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2652 The information of the detected coding system is set in CODING. */
2655 detect_coding (coding
, src
, src_bytes
)
2656 struct coding_system
*coding
;
2660 int mask
= detect_coding_mask (src
, src_bytes
);
2663 if (mask
== CODING_CATEGORY_MASK_ANY
)
2664 /* We found nothing other than ASCII. There's nothing to do. */
2668 /* The source text seems to be encoded in unknown coding system.
2669 Emacs regards the category of such a kind of coding system as
2670 `coding-category-binary'. We assume that a user has assigned
2671 an appropriate coding system for a `coding-category-binary'. */
2672 idx
= CODING_CATEGORY_IDX_BINARY
;
2675 /* We found some plausible coding systems. Let's use a coding
2676 system of the highest priority. */
2677 Lisp_Object val
= Vcoding_category_list
;
2682 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
2683 if ((idx
< CODING_CATEGORY_IDX_MAX
) && (mask
& (1 << idx
)))
2685 val
= XCONS (val
)->cdr
;
2692 /* For unknown reason, `Vcoding_category_list' contains none
2693 of found categories. Let's use any of them. */
2694 for (idx
= 0; idx
< CODING_CATEGORY_IDX_MAX
; idx
++)
2695 if (mask
& (1 << idx
))
2699 setup_coding_system (XSYMBOL (coding_category_table
[idx
])->value
, coding
);
2702 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2703 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2704 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2707 detect_eol_type (src
, src_bytes
)
2711 unsigned char *src_end
= src
+ src_bytes
;
2714 while (src
< src_end
)
2718 return CODING_EOL_LF
;
2721 if (src
< src_end
&& *src
== '\n')
2722 return CODING_EOL_CRLF
;
2724 return CODING_EOL_CR
;
2727 return CODING_EOL_UNDECIDED
;
2730 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2731 is encoded. If it detects an appropriate format of end-of-line, it
2732 sets the information in *CODING. */
2735 detect_eol (coding
, src
, src_bytes
)
2736 struct coding_system
*coding
;
2741 int eol_type
= detect_eol_type (src
, src_bytes
);
2743 if (eol_type
== CODING_EOL_UNDECIDED
)
2744 /* We found no end-of-line in the source text. */
2747 val
= Fget (coding
->symbol
, Qeol_type
);
2748 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
2749 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
2752 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2753 decoding, it may detect coding system and format of end-of-line if
2754 those are not yet decided. */
2757 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2758 struct coding_system
*coding
;
2759 unsigned char *source
, *destination
;
2760 int src_bytes
, dst_bytes
;
2771 if (coding
->type
== coding_type_undecided
)
2772 detect_coding (coding
, source
, src_bytes
);
2774 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
2775 detect_eol (coding
, source
, src_bytes
);
2777 coding
->carryover_size
= 0;
2778 switch (coding
->type
)
2780 case coding_type_no_conversion
:
2781 label_no_conversion
:
2782 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2783 bcopy (source
, destination
, produced
);
2784 *consumed
= produced
;
2787 case coding_type_emacs_mule
:
2788 case coding_type_undecided
:
2789 if (coding
->eol_type
== CODING_EOL_LF
2790 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2791 goto label_no_conversion
;
2792 produced
= decode_eol (coding
, source
, destination
,
2793 src_bytes
, dst_bytes
, consumed
);
2796 case coding_type_sjis
:
2797 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2798 src_bytes
, dst_bytes
, consumed
,
2802 case coding_type_iso2022
:
2803 produced
= decode_coding_iso2022 (coding
, source
, destination
,
2804 src_bytes
, dst_bytes
, consumed
);
2807 case coding_type_big5
:
2808 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2809 src_bytes
, dst_bytes
, consumed
,
2813 case coding_type_ccl
:
2814 produced
= ccl_driver (&coding
->spec
.ccl
.decoder
, source
, destination
,
2815 src_bytes
, dst_bytes
, consumed
);
2822 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2825 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2826 struct coding_system
*coding
;
2827 unsigned char *source
, *destination
;
2828 int src_bytes
, dst_bytes
;
2833 switch (coding
->type
)
2835 case coding_type_no_conversion
:
2836 label_no_conversion
:
2837 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2840 bcopy (source
, destination
, produced
);
2841 if (coding
->selective
)
2843 unsigned char *p
= destination
, *pend
= destination
+ produced
;
2845 if (*p
++ == '\015') p
[-1] = '\n';
2848 *consumed
= produced
;
2851 case coding_type_emacs_mule
:
2852 case coding_type_undecided
:
2853 if (coding
->eol_type
== CODING_EOL_LF
2854 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2855 goto label_no_conversion
;
2856 produced
= encode_eol (coding
, source
, destination
,
2857 src_bytes
, dst_bytes
, consumed
);
2860 case coding_type_sjis
:
2861 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2862 src_bytes
, dst_bytes
, consumed
,
2866 case coding_type_iso2022
:
2867 produced
= encode_coding_iso2022 (coding
, source
, destination
,
2868 src_bytes
, dst_bytes
, consumed
);
2871 case coding_type_big5
:
2872 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2873 src_bytes
, dst_bytes
, consumed
,
2877 case coding_type_ccl
:
2878 produced
= ccl_driver (&coding
->spec
.ccl
.encoder
, source
, destination
,
2879 src_bytes
, dst_bytes
, consumed
);
2886 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2888 /* Return maximum size (bytes) of a buffer enough for decoding
2889 SRC_BYTES of text encoded in CODING. */
2892 decoding_buffer_size (coding
, src_bytes
)
2893 struct coding_system
*coding
;
2898 if (coding
->type
== coding_type_iso2022
)
2900 else if (coding
->type
== coding_type_ccl
)
2901 magnification
= coding
->spec
.ccl
.decoder
.buf_magnification
;
2905 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2908 /* Return maximum size (bytes) of a buffer enough for encoding
2909 SRC_BYTES of text to CODING. */
2912 encoding_buffer_size (coding
, src_bytes
)
2913 struct coding_system
*coding
;
2918 if (coding
->type
== coding_type_ccl
)
2919 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
2923 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2926 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2927 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2930 char *conversion_buffer
;
2931 int conversion_buffer_size
;
2933 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2934 or decoding. Sufficient memory is allocated automatically. If we
2935 run out of memory, return NULL. */
2938 get_conversion_buffer (size
)
2941 if (size
> conversion_buffer_size
)
2944 int real_size
= conversion_buffer_size
* 2;
2946 while (real_size
< size
) real_size
*= 2;
2947 buf
= (char *) xmalloc (real_size
);
2948 xfree (conversion_buffer
);
2949 conversion_buffer
= buf
;
2950 conversion_buffer_size
= real_size
;
2952 return conversion_buffer
;
2957 /*** 7. Emacs Lisp library functions ***/
2959 DEFUN ("coding-system-spec", Fcoding_system_spec
, Scoding_system_spec
,
2961 "Return coding-spec of CODING-SYSTEM.\n\
2962 If CODING-SYSTEM is not a valid coding-system, return nil.")
2966 while (SYMBOLP (obj
) && !NILP (obj
))
2967 obj
= Fget (obj
, Qcoding_system
);
2968 return ((NILP (obj
) || !VECTORP (obj
) || XVECTOR (obj
)->size
!= 5)
2972 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
2973 "Return t if OBJECT is nil or a coding-system.\n\
2974 See document of make-coding-system for coding-system object.")
2978 return ((NILP (obj
) || !NILP (Fcoding_system_spec (obj
))) ? Qt
: Qnil
);
2981 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
2982 Sread_non_nil_coding_system
, 1, 1, 0,
2983 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2990 val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_spec
,
2991 Qt
, Qnil
, Qnil
, Qnil
);
2993 while (XSTRING (val
)->size
== 0);
2994 return (Fintern (val
, Qnil
));
2997 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 1, 0,
2998 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3002 Lisp_Object val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_p
,
3003 Qt
, Qnil
, Qnil
, Qnil
);
3004 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
3007 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
3009 "Check validity of CODING-SYSTEM.\n\
3010 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3011 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3012 The value of property should be a vector of length 5.")
3014 Lisp_Object coding_system
;
3016 CHECK_SYMBOL (coding_system
, 0);
3017 if (!NILP (Fcoding_system_p (coding_system
)))
3018 return coding_system
;
3020 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
3023 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
3025 "Detect coding system of the text in the region between START and END.\n\
3026 Return a list of possible coding systems ordered by priority.\n\
3027 If only ASCII characters are found, it returns `undecided'\n\
3028 or its subsidiary coding system according to a detected end-of-line format.")
3032 int coding_mask
, eol_type
;
3036 validate_region (&b
, &e
);
3037 beg
= XINT (b
), end
= XINT (e
);
3038 if (beg
< GPT
&& end
>= GPT
) move_gap (end
);
3040 coding_mask
= detect_coding_mask (POS_ADDR (beg
), end
- beg
);
3041 eol_type
= detect_eol_type (POS_ADDR (beg
), end
- beg
);
3043 if (coding_mask
== CODING_CATEGORY_MASK_ANY
)
3045 val
= intern ("undecided");
3046 if (eol_type
!= CODING_EOL_UNDECIDED
)
3048 Lisp_Object val2
= Fget (val
, Qeol_type
);
3050 val
= XVECTOR (val2
)->contents
[eol_type
];
3057 /* At first, gather possible coding-systems in VAL in a reverse
3060 for (val2
= Vcoding_category_list
;
3062 val2
= XCONS (val2
)->cdr
)
3065 = XFASTINT (Fget (XCONS (val2
)->car
, Qcoding_category_index
));
3066 if (coding_mask
& (1 << idx
))
3067 val
= Fcons (Fsymbol_value (XCONS (val2
)->car
), val
);
3070 /* Then, change the order of the list, while getting subsidiary
3074 for (; !NILP (val2
); val2
= XCONS (val2
)->cdr
)
3076 if (eol_type
== CODING_EOL_UNDECIDED
)
3077 val
= Fcons (XCONS (val2
)->car
, val
);
3080 Lisp_Object val3
= Fget (XCONS (val2
)->car
, Qeol_type
);
3082 val
= Fcons (XVECTOR (val3
)->contents
[eol_type
], val
);
3084 val
= Fcons (XCONS (val2
)->car
, val
);
3092 /* Scan text in the region between *BEGP and *ENDP, skip characters
3093 which we never have to encode to (iff ENCODEP is 1) or decode from
3094 coding system CODING at the head and tail, then set BEGP and ENDP
3095 to the addresses of start and end of the text we actually convert. */
3098 shrink_conversion_area (begp
, endp
, coding
, encodep
)
3099 unsigned char **begp
, **endp
;
3100 struct coding_system
*coding
;
3103 register unsigned char *beg_addr
= *begp
, *end_addr
= *endp
;
3105 if (coding
->eol_type
!= CODING_EOL_LF
3106 && coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3107 /* Since we anyway have to convert end-of-line format, it is not
3108 worth skipping at most 100 bytes or so. */
3111 if (encodep
) /* for encoding */
3113 switch (coding
->type
)
3115 case coding_type_no_conversion
:
3116 case coding_type_emacs_mule
:
3117 case coding_type_undecided
:
3118 /* We need no conversion. */
3121 case coding_type_ccl
:
3122 /* We can't skip any data. */
3124 case coding_type_iso2022
:
3125 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3127 unsigned char *bol
= beg_addr
;
3128 while (beg_addr
< end_addr
&& *beg_addr
< 0x80)
3131 if (*(beg_addr
- 1) == '\n')
3135 goto label_skip_tail
;
3139 /* We can skip all ASCII characters at the head and tail. */
3140 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
3142 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
3146 else /* for decoding */
3148 switch (coding
->type
)
3150 case coding_type_no_conversion
:
3151 /* We need no conversion. */
3154 case coding_type_emacs_mule
:
3155 if (coding
->eol_type
== CODING_EOL_LF
)
3157 /* We need no conversion. */
3161 /* We can skip all but carriage-return. */
3162 while (beg_addr
< end_addr
&& *beg_addr
!= '\r') beg_addr
++;
3163 while (beg_addr
< end_addr
&& *(end_addr
- 1) != '\r') end_addr
--;
3165 case coding_type_sjis
:
3166 case coding_type_big5
:
3167 /* We can skip all ASCII characters at the head. */
3168 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
3169 /* We can skip all ASCII characters at the tail except for
3170 the second byte of SJIS or BIG5 code. */
3171 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
3172 if (end_addr
!= *endp
)
3175 case coding_type_ccl
:
3176 /* We can't skip any data. */
3178 default: /* i.e. case coding_type_iso2022: */
3182 /* We can skip all ASCII characters except for a few
3183 control codes at the head. */
3184 while (beg_addr
< end_addr
&& (c
= *beg_addr
) < 0x80
3185 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3186 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
)
3197 /* Encode to (iff ENCODEP is 1) or decode from coding system CODING a
3198 text between B and E. B and E are buffer positions. */
3201 code_convert_region (b
, e
, coding
, encodep
)
3203 struct coding_system
*coding
;
3206 int beg
, end
, len
, consumed
, produced
;
3208 unsigned char *begp
, *endp
;
3211 validate_region (&b
, &e
);
3212 beg
= XINT (b
), end
= XINT (e
);
3213 if (beg
< GPT
&& end
>= GPT
)
3216 if (encodep
&& !NILP (coding
->pre_write_conversion
))
3218 /* We must call a pre-conversion function which may put a new
3219 text to be converted in a new buffer. */
3220 struct buffer
*old
= current_buffer
, *new;
3223 call2 (coding
->pre_write_conversion
, b
, e
);
3224 if (old
!= current_buffer
)
3226 /* Replace the original text by the text just generated. */
3228 new = current_buffer
;
3229 set_buffer_internal (old
);
3230 del_range (beg
, end
);
3231 insert_from_buffer (new, 1, len
, 0);
3236 /* We may be able to shrink the conversion region. */
3237 begp
= POS_ADDR (beg
); endp
= begp
+ (end
- beg
);
3238 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3241 /* We need no conversion. */
3245 beg
+= begp
- POS_ADDR (beg
);
3246 end
= beg
+ (endp
- begp
);
3249 len
= encoding_buffer_size (coding
, end
- beg
);
3251 len
= decoding_buffer_size (coding
, end
- beg
);
3252 buf
= get_conversion_buffer (len
);
3254 coding
->last_block
= 1;
3256 ? encode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3258 : decode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3261 len
= produced
+ (beg
- XINT (b
)) + (XINT (e
) - end
);
3264 insert (buf
, produced
);
3265 del_range (PT
, PT
+ end
- beg
);
3267 pos
= PT
+ (pos
- end
);
3273 if (!encodep
&& !NILP (coding
->post_read_conversion
))
3275 /* We must call a post-conversion function which may alter
3276 the text just converted. */
3281 insval
= call1 (coding
->post_read_conversion
, make_number (len
));
3282 CHECK_NUMBER (insval
, 0);
3283 len
= XINT (insval
);
3286 return make_number (len
);
3290 code_convert_string (str
, coding
, encodep
, nocopy
)
3291 Lisp_Object str
, nocopy
;
3292 struct coding_system
*coding
;
3295 int len
, consumed
, produced
;
3297 unsigned char *begp
, *endp
;
3298 int head_skip
, tail_skip
;
3299 struct gcpro gcpro1
;
3301 if (encodep
&& !NILP (coding
->pre_write_conversion
)
3302 || !encodep
&& !NILP (coding
->post_read_conversion
))
3304 /* Since we have to call Lisp functions which assume target text
3305 is in a buffer, after setting a temporary buffer, call
3306 code_convert_region. */
3307 int count
= specpdl_ptr
- specpdl
;
3308 int len
= XSTRING (str
)->size
;
3310 struct buffer
*old
= current_buffer
;
3312 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
3313 temp_output_buffer_setup (" *code-converting-work*");
3314 set_buffer_internal (XBUFFER (Vstandard_output
));
3315 insert_from_string (str
, 0, len
, 0);
3316 code_convert_region (make_number (BEGV
), make_number (ZV
),
3318 result
= make_buffer_string (BEGV
, ZV
, 0);
3319 set_buffer_internal (old
);
3320 return unbind_to (count
, result
);
3323 /* We may be able to shrink the conversion region. */
3324 begp
= XSTRING (str
)->data
;
3325 endp
= begp
+ XSTRING (str
)->size
;
3326 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3329 /* We need no conversion. */
3330 return (NILP (nocopy
) ? Fcopy_sequence (str
) : str
);
3332 head_skip
= begp
- XSTRING (str
)->data
;
3333 tail_skip
= XSTRING (str
)->size
- head_skip
- (endp
- begp
);
3338 len
= encoding_buffer_size (coding
, endp
- begp
);
3340 len
= decoding_buffer_size (coding
, endp
- begp
);
3341 buf
= get_conversion_buffer (len
+ head_skip
+ tail_skip
);
3343 bcopy (XSTRING (str
)->data
, buf
, head_skip
);
3344 coding
->last_block
= 1;
3346 ? encode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3347 buf
+ head_skip
, endp
- begp
, len
, &consumed
)
3348 : decode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3349 buf
+ head_skip
, endp
- begp
, len
, &consumed
));
3350 bcopy (XSTRING (str
)->data
+ head_skip
+ (endp
- begp
),
3351 buf
+ head_skip
+ produced
,
3356 return make_string (buf
, head_skip
+ produced
+ tail_skip
);
3359 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
3360 3, 3, "r\nzCoding system: ",
3361 "Decode current region by specified coding system.\n\
3362 When called from a program, takes three arguments:\n\
3363 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3364 Return length of decoded text.")
3365 (b
, e
, coding_system
)
3366 Lisp_Object b
, e
, coding_system
;
3368 struct coding_system coding
;
3370 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3371 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3372 CHECK_SYMBOL (coding_system
, 2);
3374 if (NILP (coding_system
))
3375 return make_number (XFASTINT (e
) - XFASTINT (b
));
3376 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3377 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3379 return code_convert_region (b
, e
, &coding
, 0);
3382 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
3383 3, 3, "r\nzCoding system: ",
3384 "Encode current region by specified coding system.\n\
3385 When called from a program, takes three arguments:\n\
3386 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3387 Return length of encoded text.")
3388 (b
, e
, coding_system
)
3389 Lisp_Object b
, e
, coding_system
;
3391 struct coding_system coding
;
3393 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3394 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3395 CHECK_SYMBOL (coding_system
, 2);
3397 if (NILP (coding_system
))
3398 return make_number (XFASTINT (e
) - XFASTINT (b
));
3399 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3400 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3402 return code_convert_region (b
, e
, &coding
, 1);
/* Lisp primitive `decode-coding-string'.

   STRING must be a string and CODING_SYSTEM a symbol.  When
   CODING_SYSTEM is nil no conversion is done: STRING itself is
   returned if NOCOPY is non-nil, otherwise a copy made with
   Fcopy_sequence.  For any other coding system a
   `struct coding_system' is set up (an error is signalled if that
   fails) and the result of code_convert_string, called with 0 as its
   third argument, is returned.

   NOTE(review): in this copy the embedded line numbers skip 3406 and
   3409, i.e. the DEFUN argument-count line and the end of the
   docstring are missing, as are the function's braces.  Restore them
   from a pristine source before building.  */
3405 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
3407 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3408 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3410 (string
, coding_system
, nocopy
)
3411 Lisp_Object string
, coding_system
, nocopy
;
3413 struct coding_system coding
;
3415 CHECK_STRING (string
, 0);
3416 CHECK_SYMBOL (coding_system
, 1);
3418 if (NILP (coding_system
))
3419 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3420 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3421 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3423 return code_convert_string (string
, &coding
, 0, nocopy
);
/* Lisp primitive `encode-coding-string'.

   STRING must be a string and CODING_SYSTEM a symbol.  When
   CODING_SYSTEM is nil no conversion is done: STRING itself is
   returned if NOCOPY is non-nil, otherwise a copy made with
   Fcopy_sequence.  For any other coding system a
   `struct coding_system' is set up (an error is signalled if that
   fails) and the result of code_convert_string, called with 1 as its
   third argument, is returned.

   NOTE(review): in this copy the embedded line numbers skip 3427 and
   3430, i.e. the DEFUN argument-count line and the end of the
   docstring are missing, as are the function's braces.  Restore them
   from a pristine source before building.  */
3426 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
3428 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3429 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3431 (string
, coding_system
, nocopy
)
3432 Lisp_Object string
, coding_system
, nocopy
;
3434 struct coding_system coding
;
3436 CHECK_STRING (string
, 0);
3437 CHECK_SYMBOL (coding_system
, 1);
3439 if (NILP (coding_system
))
3440 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3441 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3442 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3444 return code_convert_string (string
, &coding
, 1, nocopy
);
3447 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
3448 "Decode a JISX0208 character of shift-jis encoding.\n\
3449 CODE is the character code in SJIS.\n\
3450 Return the corresponding character.")
3454 unsigned char c1
, c2
, s1
, s2
;
3457 CHECK_NUMBER (code
, 0);
3458 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
3459 DECODE_SJIS (s1
, s2
, c1
, c2
);
3460 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
3464 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
3465 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3466 Return the corresponding character code in SJIS.")
3470 int charset
, c1
, c2
, s1
, s2
;
3473 CHECK_NUMBER (ch
, 0);
3474 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3475 if (charset
== charset_jisx0208
)
3477 ENCODE_SJIS (c1
, c2
, s1
, s2
);
3478 XSETFASTINT (val
, (s1
<< 8) | s2
);
3481 XSETFASTINT (val
, 0);
3485 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
3486 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3487 CODE is the character code in BIG5.\n\
3488 Return the corresponding character.")
3493 unsigned char b1
, b2
, c1
, c2
;
3496 CHECK_NUMBER (code
, 0);
3497 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
3498 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
3499 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
3503 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
3504 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3505 Return the corresponding character code in Big5.")
3509 int charset
, c1
, c2
, b1
, b2
;
3512 CHECK_NUMBER (ch
, 0);
3513 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3514 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
3516 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
3517 XSETFASTINT (val
, (b1
<< 8) | b2
);
3520 XSETFASTINT (val
, 0);
3524 DEFUN ("set-terminal-coding-system-internal",
3525 Fset_terminal_coding_system_internal
,
3526 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
3528 Lisp_Object coding_system
;
3530 CHECK_SYMBOL (coding_system
, 0);
3531 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
3535 DEFUN ("terminal-coding-system",
3536 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
3537 "Return coding-system of your terminal.")
3540 return terminal_coding
.symbol
;
3543 DEFUN ("set-keyboard-coding-system-internal",
3544 Fset_keyboard_coding_system_internal
,
3545 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
3547 Lisp_Object coding_system
;
3549 CHECK_SYMBOL (coding_system
, 0);
3550 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
3554 DEFUN ("keyboard-coding-system",
3555 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
3556 "Return coding-system of what is sent from terminal keyboard.")
3559 return keyboard_coding
.symbol
;
3563 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
3564 Sfind_operation_coding_system
, 1, MANY
, 0,
3565 "Choose a coding system for an operation based on the target name.\n\
3566 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3567 DECODING-SYSTEM is the coding system to use for decoding\n\
3568 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3569 for encoding (in case OPERATION does encoding).\n\
3571 The first argument OPERATION specifies an I/O primitive:\n\
3572 For file I/O, `insert-file-contents' or `write-region'.\n\
3573 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3574 For network I/O, `open-network-stream'.\n\
3576 The remaining arguments should be the same arguments that were passed\n\
3577 to the primitive. Depending on which primitive, one of those arguments\n\
3578 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3579 whichever argument specifies the file name is TARGET.\n\
3581 TARGET has a meaning which depends on OPERATION:\n\
3582 For file I/O, TARGET is a file name.\n\
3583 For process I/O, TARGET is a process name.\n\
3584 For network I/O, TARGET is a service name or a port number\n\
3586 This function looks up what specified for TARGET in,\n\
3587 `file-coding-system-alist', `process-coding-system-alist',\n\
3588 or `network-coding-system-alist' depending on OPERATION.\n\
3589 They may specify a coding system, a cons of coding systems,\n\
3590 or a function symbol to call.\n\
3591 In the last case, we call the function with one argument,\n\
3592 which is a list of all the arguments given to this function.")
3597 Lisp_Object operation
, target_idx
, target
, val
;
3598 register Lisp_Object chain
;
3601 error ("Too few arguments");
3602 operation
= args
[0];
3603 if (!SYMBOLP (operation
)
3604 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
3605 error ("Invalid first arguement");
3606 if (nargs
< 1 + XINT (target_idx
))
3607 error ("Too few arguments for operation: %s",
3608 XSYMBOL (operation
)->name
->data
);
3609 target
= args
[XINT (target_idx
) + 1];
3610 if (!(STRINGP (target
)
3611 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
3612 error ("Invalid %dth argument", XINT (target_idx
) + 1);
3614 chain
= ((EQ (operation
, Qinsert_file_contents
)
3615 || EQ (operation
, Qwrite_region
))
3616 ? Vfile_coding_system_alist
3617 : (EQ (operation
, Qopen_network_stream
)
3618 ? Vnetwork_coding_system_alist
3619 : Vprocess_coding_system_alist
));
3623 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
3625 Lisp_Object elt
= XCONS (chain
)->car
;
3628 && ((STRINGP (target
)
3629 && STRINGP (XCONS (elt
)->car
)
3630 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
3631 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
3633 val
= XCONS (elt
)->cdr
;
3636 if (! SYMBOLP (val
))
3638 if (! NILP (Fcoding_system_p (val
)))
3639 return Fcons (val
, val
);
3640 if (!NILP (Ffboundp (val
)))
3641 return call1 (val
, Flist (nargs
, args
));
3651 /*** 8. Post-amble ***/
3657 /* Emacs' internal format specific initialize routine. */
3658 for (i
= 0; i
<= 0x20; i
++)
3659 emacs_code_class
[i
] = EMACS_control_code
;
3660 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
3661 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
3662 for (i
= 0x21 ; i
< 0x7F; i
++)
3663 emacs_code_class
[i
] = EMACS_ascii_code
;
3664 emacs_code_class
[0x7F] = EMACS_control_code
;
3665 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
3666 for (i
= 0x81; i
< 0xFF; i
++)
3667 emacs_code_class
[i
] = EMACS_invalid_code
;
3668 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
3669 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
3670 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
3671 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
3673 /* ISO2022 specific initialize routine. */
3674 for (i
= 0; i
< 0x20; i
++)
3675 iso_code_class
[i
] = ISO_control_code
;
3676 for (i
= 0x21; i
< 0x7F; i
++)
3677 iso_code_class
[i
] = ISO_graphic_plane_0
;
3678 for (i
= 0x80; i
< 0xA0; i
++)
3679 iso_code_class
[i
] = ISO_control_code
;
3680 for (i
= 0xA1; i
< 0xFF; i
++)
3681 iso_code_class
[i
] = ISO_graphic_plane_1
;
3682 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
3683 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
3684 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
3685 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
3686 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
3687 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
3688 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
3689 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
3690 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
3691 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
3693 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
3694 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
3696 setup_coding_system (Qnil
, &keyboard_coding
);
3697 setup_coding_system (Qnil
, &terminal_coding
);
3699 #if defined (MSDOS) || defined (WINDOWSNT)
3700 system_eol_type
= CODING_EOL_CRLF
;
3702 system_eol_type
= CODING_EOL_LF
;
3710 Qtarget_idx
= intern ("target-idx");
3711 staticpro (&Qtarget_idx
);
3713 /* Target FILENAME is the first argument. */
3714 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
3715 /* Target FILENAME is the third argument. */
3716 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
3718 Qcall_process
= intern ("call-process");
3719 staticpro (&Qcall_process
);
3720 /* Target PROGRAM is the first argument. */
3721 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
3723 Qcall_process_region
= intern ("call-process-region");
3724 staticpro (&Qcall_process_region
);
3725 /* Target PROGRAM is the third argument. */
3726 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
3728 Qstart_process
= intern ("start-process");
3729 staticpro (&Qstart_process
);
3730 /* Target PROGRAM is the third argument. */
3731 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
3733 Qopen_network_stream
= intern ("open-network-stream");
3734 staticpro (&Qopen_network_stream
);
3735 /* Target SERVICE is the fourth argument. */
3736 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
3738 Qcoding_system
= intern ("coding-system");
3739 staticpro (&Qcoding_system
);
3741 Qeol_type
= intern ("eol-type");
3742 staticpro (&Qeol_type
);
3744 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
3745 staticpro (&Qbuffer_file_coding_system
);
3747 Qpost_read_conversion
= intern ("post-read-conversion");
3748 staticpro (&Qpost_read_conversion
);
3750 Qpre_write_conversion
= intern ("pre-write-conversion");
3751 staticpro (&Qpre_write_conversion
);
3753 Qcoding_system_spec
= intern ("coding-system-spec");
3754 staticpro (&Qcoding_system_spec
);
3756 Qcoding_system_p
= intern ("coding-system-p");
3757 staticpro (&Qcoding_system_p
);
3759 Qcoding_system_error
= intern ("coding-system-error");
3760 staticpro (&Qcoding_system_error
);
3762 Fput (Qcoding_system_error
, Qerror_conditions
,
3763 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
3764 Fput (Qcoding_system_error
, Qerror_message
,
3765 build_string ("Invalid coding system"));
/* Intern `coding-category-index' and register it with staticpro.  */
3767 Qcoding_category_index
= intern ("coding-category-index");
3768 staticpro (&Qcoding_category_index
);
/* For every index below CODING_CATEGORY_IDX_MAX, intern the category
   name from coding_category_name[] into coding_category_table[],
   register that slot with staticpro, and put a
   `coding-category-index' property on the new symbol.
   NOTE(review): the property value (the last argument of the Fput
   call, original line 3777) and the braces around the loop body are
   missing from this copy; presumably the value is the index itself --
   confirm against a pristine source.  */
3772 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3774 coding_category_table
[i
] = intern (coding_category_name
[i
]);
3775 staticpro (&coding_category_table
[i
]);
3776 Fput (coding_category_table
[i
], Qcoding_category_index
,
/* Intern `character-unification-table', register it with staticpro,
   and give it a `char-table-extra-slots' property.
   NOTE(review): the property value (original line 3784) is missing
   from this copy and cannot be inferred from the surrounding code.  */
3781 Qcharacter_unification_table
= intern ("character-unification-table");
3782 staticpro (&Qcharacter_unification_table
);
3783 Fput (Qcharacter_unification_table
, Qchar_table_extra_slots
,
3786 Qcharacter_unification_table_for_decode
3787 = intern ("character-unification-table-for-decode");
3788 staticpro (&Qcharacter_unification_table_for_decode
);
3790 Qcharacter_unification_table_for_encode
3791 = intern ("character-unification-table-for-encode");
3792 staticpro (&Qcharacter_unification_table_for_encode
);
3794 Qemacs_mule
= intern ("emacs-mule");
3795 staticpro (&Qemacs_mule
);
3797 defsubr (&Scoding_system_spec
);
3798 defsubr (&Scoding_system_p
);
3799 defsubr (&Sread_coding_system
);
3800 defsubr (&Sread_non_nil_coding_system
);
3801 defsubr (&Scheck_coding_system
);
3802 defsubr (&Sdetect_coding_region
);
3803 defsubr (&Sdecode_coding_region
);
3804 defsubr (&Sencode_coding_region
);
3805 defsubr (&Sdecode_coding_string
);
3806 defsubr (&Sencode_coding_string
);
3807 defsubr (&Sdecode_sjis_char
);
3808 defsubr (&Sencode_sjis_char
);
3809 defsubr (&Sdecode_big5_char
);
3810 defsubr (&Sencode_big5_char
);
3811 defsubr (&Sset_terminal_coding_system_internal
);
3812 defsubr (&Sterminal_coding_system
);
3813 defsubr (&Sset_keyboard_coding_system_internal
);
3814 defsubr (&Skeyboard_coding_system
);
3815 defsubr (&Sfind_operation_coding_system
);
3817 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
3818 "List of coding-categories (symbols) ordered by priority.");
3822 Vcoding_category_list
= Qnil
;
3823 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
3824 Vcoding_category_list
3825 = Fcons (coding_category_table
[i
], Vcoding_category_list
);
3828 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
3829 "Specify the coding system for read operations.\n\
3830 It is useful to bind this variable with `let', but do not set it globally.
3831 If the value is a coding system, it is used for decoding on read operation.\n\
3832 If not, an appropriate element is used from one of the coding system alists:\n\
3833 There are three such tables, `file-coding-system-alist',\n\
3834 `process-coding-system-alist', and `network-coding-system-alist'.");
3835 Vcoding_system_for_read
= Qnil
;
3837 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
3838 "Specify the coding system for write operations.\n\
3839 It is useful to bind this variable with `let', but do not set it globally.
3840 If the value is a coding system, it is used for encoding on write operation.\n\
3841 If not, an appropriate element is used from one of the coding system alists:\n\
3842 There are three such tables, `file-coding-system-alist',\n\
3843 `process-coding-system-alist', and `network-coding-system-alist'.");
3844 Vcoding_system_for_write
= Qnil
;
3846 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
3847 "Coding system used in the latest file or process I/O.");
3848 Vlast_coding_system_used
= Qnil
;
3850 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
3851 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3852 inhibit_eol_conversion
= 0;
3854 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
3855 "Alist to decide a coding system to use for a file I/O operation.\n\
3856 The format is ((PATTERN . VAL) ...),\n\
3857 where PATTERN is a regular expression matching a file name,\n\
3858 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3859 If VAL is a coding system, it is used for both decoding and encoding\n\
3860 the file contents.\n\
3861 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3862 and the cdr part is used for encoding.\n\
3863 If VAL is a function symbol, the function must return a coding system\n\
3864 or a cons of coding systems which are used as above.\n\
3866 See also the function `find-operation-coding-system'.");
3867 Vfile_coding_system_alist
= Qnil
;
3869 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
3870 "Alist to decide a coding system to use for a process I/O operation.\n\
3871 The format is ((PATTERN . VAL) ...),\n\
3872 where PATTERN is a regular expression matching a program name,\n\
3873 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3874 If VAL is a coding system, it is used for both decoding what received\n\
3875 from the program and encoding what sent to the program.\n\
3876 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3877 and the cdr part is used for encoding.\n\
3878 If VAL is a function symbol, the function must return a coding system\n\
3879 or a cons of coding systems which are used as above.\n\
3881 See also the function `find-operation-coding-system'.");
3882 Vprocess_coding_system_alist
= Qnil
;
3884 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
3885 "Alist to decide a coding system to use for a network I/O operation.\n\
3886 The format is ((PATTERN . VAL) ...),\n\
3887 where PATTERN is a regular expression matching a network service name\n\
3888 or is a port number to connect to,\n\
3889 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3890 If VAL is a coding system, it is used for both decoding what received\n\
3891 from the network stream and encoding what sent to the network stream.\n\
3892 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3893 and the cdr part is used for encoding.\n\
3894 If VAL is a function symbol, the function must return a coding system\n\
3895 or a cons of coding systems which are used as above.\n\
3897 See also the function `find-operation-coding-system'.");
3898 Vnetwork_coding_system_alist
= Qnil
;
3900 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
3901 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3902 eol_mnemonic_unix
= ':';
3904 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
3905 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3906 eol_mnemonic_dos
= '\\';
3908 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
3909 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3910 eol_mnemonic_mac
= '/';
3912 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
3913 "Mnemonic character indicating end-of-line format is not yet decided.");
3914 eol_mnemonic_undecided
= ':';
3916 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification
,
3917 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3918 Venable_character_unification
= Qt
;
3920 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3921 &Vstandard_character_unification_table_for_decode
,
3922 "Table for unifying characters when reading.");
3923 Vstandard_character_unification_table_for_decode
= Qnil
;
3925 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3926 &Vstandard_character_unification_table_for_encode
,
3927 "Table for unifying characters when writing.");
3928 Vstandard_character_unification_table_for_encode
= Qnil
;
3930 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
3931 "Alist of charsets vs revision numbers.\n\
3932 While encoding, if a charset (car part of an element) is found,\n\
3933 designate it with the escape sequence identifing revision (cdr part of the element).");
3934 Vcharset_revision_alist
= Qnil
;
3936 DEFVAR_LISP ("default-process-coding-system",
3937 &Vdefault_process_coding_system
,
3938 "Cons of coding systems used for process I/O by default.\n\
3939 The car part is used for decoding a process output,\n\
3940 the cdr part is used for encoding a text to be sent to a process.");
3941 Vdefault_process_coding_system
= Qnil
;