1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See section 6 for more details.
84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
88 whereas DOS's format is two-byte sequence of `carriage-return' and
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
93 any format of end-of-line. So, Emacs has information of format of
94 end-of-line in each coding-system. See section 6 for more details.
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX is set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
107 detect_coding_emacs_mule (src
, src_end
)
108 unsigned char *src
, *src_end
;
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format (emacs-mule). The resulting text
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
123 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
124 struct coding_system
*coding
;
125 unsigned char *source
, *destination
;
126 int src_bytes
, dst_bytes
;
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
142 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
143 struct coding_system
*coding
;
144 unsigned char *source
, *destination
;
145 int src_bytes
, dst_bytes
;
152 /*** COMMONLY USED MACROS ***/
154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
160 #define ONE_MORE_BYTE(c1) \
165 goto label_end_of_loop; \
168 #define TWO_MORE_BYTES(c1, c2) \
170 if (src + 1 < src_end) \
171 c1 = *src++, c2 = *src++; \
173 goto label_end_of_loop; \
176 #define THREE_MORE_BYTES(c1, c2, c3) \
178 if (src + 2 < src_end) \
179 c1 = *src++, c2 = *src++, c3 = *src++; \
181 goto label_end_of_loop; \
184 /* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
191 /* Decode one ASCII character C. */
193 #define DECODE_CHARACTER_ASCII(c) \
195 if (COMPOSING_P (coding->composing)) \
196 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
202 position-code is C. */
204 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
206 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
207 if (COMPOSING_P (coding->composing)) \
208 *dst++ = leading_code + 0x20; \
210 *dst++ = leading_code; \
211 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
212 *dst++ = leading_code; \
213 *dst++ = (c) | 0x80; \
216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
217 position-codes are C1 and C2. */
219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
221 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
222 *dst++ = (c2) | 0x80; \
226 /*** 1. Preamble ***/
240 #else /* not emacs */
244 #endif /* not emacs */
246 Lisp_Object Qcoding_system
, Qeol_type
;
247 Lisp_Object Qbuffer_file_coding_system
;
248 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
250 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
251 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
252 Lisp_Object Qstart_process
, Qopen_network_stream
;
253 Lisp_Object Qtarget_idx
;
255 /* Mnemonic character of each format of end-of-line. */
256 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
257 /* Mnemonic character to indicate format of end-of-line is not yet
259 int eol_mnemonic_undecided
;
261 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
267 Lisp_Object Qcoding_system_spec
, Qcoding_system_p
, Qcoding_system_error
;
269 /* Coding system emacs-mule is for converting only end-of-line format. */
270 Lisp_Object Qemacs_mule
;
272 /* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274 /* Coding-system for reading files and receiving data from process. */
275 Lisp_Object Vcoding_system_for_read
;
276 /* Coding-system for writing files and sending data to process. */
277 Lisp_Object Vcoding_system_for_write
;
278 /* Coding-system actually used in the latest I/O. */
279 Lisp_Object Vlast_coding_system_used
;
281 /* A vector of length 256 which contains information about special
283 Lisp_Object Vmicrosoft_code_table
;
285 /* Flag to inhibit code conversion of end-of-line format. */
286 int inhibit_eol_conversion
;
288 /* Coding system to be used to encode text for terminal display. */
289 struct coding_system terminal_coding
;
291 /* Coding system to be used to encode text for terminal display when
292 terminal coding system is nil. */
293 struct coding_system safe_terminal_coding
;
295 /* Coding system of what is sent from terminal keyboard. */
296 struct coding_system keyboard_coding
;
298 Lisp_Object Vfile_coding_system_alist
;
299 Lisp_Object Vprocess_coding_system_alist
;
300 Lisp_Object Vnetwork_coding_system_alist
;
304 Lisp_Object Qcoding_category_index
;
306 /* List of symbols `coding-category-xxx' ordered by priority. */
307 Lisp_Object Vcoding_category_list
;
309 /* Table of coding-systems currently assigned to each coding-category. */
310 Lisp_Object coding_category_table
[CODING_CATEGORY_IDX_MAX
];
312 /* Table of names of symbol for each coding-category. */
313 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
314 "coding-category-emacs-mule",
315 "coding-category-sjis",
316 "coding-category-iso-7",
317 "coding-category-iso-8-1",
318 "coding-category-iso-8-2",
319 "coding-category-iso-7-else",
320 "coding-category-iso-8-else",
321 "coding-category-big5",
322 "coding-category-binary"
325 /* Flag to tell if we look up unification table on character code
327 Lisp_Object Venable_character_unification
;
328 /* Standard unification table to look up on decoding (reading). */
329 Lisp_Object Vstandard_character_unification_table_for_decode
;
330 /* Standard unification table to look up on encoding (writing). */
331 Lisp_Object Vstandard_character_unification_table_for_encode
;
333 Lisp_Object Qcharacter_unification_table
;
334 Lisp_Object Qcharacter_unification_table_for_decode
;
335 Lisp_Object Qcharacter_unification_table_for_encode
;
337 /* Alist of charsets vs revision number. */
338 Lisp_Object Vcharset_revision_alist
;
340 /* Default coding systems used for process I/O. */
341 Lisp_Object Vdefault_process_coding_system
;
344 /*** 2. Emacs internal format (emacs-mule) handlers ***/
346 /* Emacs' internal format for encoding multiple character sets is a
347 kind of multi-byte encoding, i.e. characters are encoded by
348 variable-length sequences of one-byte codes. ASCII characters
349 and control characters (e.g. `tab', `newline') are represented by
350 one-byte sequences which are their ASCII codes, in the range 0x00
351 through 0x7F. The other characters are represented by a sequence
352 of `base leading-code', optional `extended leading-code', and one
353 or two `position-code's. The length of the sequence is determined
354 by the base leading-code. Leading-code takes the range 0x80
355 through 0x9F, whereas extended leading-code and position-code take
356 the range 0xA0 through 0xFF. See `charset.h' for more details
357 about leading-code and position-code.
359 There's one exception to this rule. Special leading-code
360 `leading-code-composition' denotes that the following several
361 characters should be composed into one character. Leading-codes of
362 components (except for ASCII) are added 0x20. An ASCII character
363 component is represented by a 2-byte sequence of `0xA0' and
364 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
365 details of composite character. Hence, we can summarize the code
368 --- CODE RANGE of Emacs' internal format ---
369 (character set) (range)
371 ELSE (1st byte) 0x80 .. 0x9F
372 (rest bytes) 0xA0 .. 0xFF
373 ---------------------------------------------
377 enum emacs_code_class_type emacs_code_class
[256];
379 /* Go to the next statement only if *SRC is accessible and the code is
380 greater than 0xA0. */
381 #define CHECK_CODE_RANGE_A0_FF \
383 if (src >= src_end) \
384 goto label_end_of_switch; \
385 else if (*src++ < 0xA0) \
389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
390 Check if a text is encoded in Emacs' internal format. If it is,
391 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
394 detect_coding_emacs_mule (src
, src_end
)
395 unsigned char *src
, *src_end
;
400 while (src
< src_end
)
412 switch (emacs_code_class
[c
])
414 case EMACS_ascii_code
:
415 case EMACS_linefeed_code
:
418 case EMACS_control_code
:
419 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
423 case EMACS_invalid_code
:
426 case EMACS_leading_code_composition
: /* c == 0x80 */
428 CHECK_CODE_RANGE_A0_FF
;
433 case EMACS_leading_code_4
:
434 CHECK_CODE_RANGE_A0_FF
;
435 /* fall down to check it two more times ... */
437 case EMACS_leading_code_3
:
438 CHECK_CODE_RANGE_A0_FF
;
439 /* fall down to check it one more time ... */
441 case EMACS_leading_code_2
:
442 CHECK_CODE_RANGE_A0_FF
;
450 return CODING_CATEGORY_MASK_EMACS_MULE
;
454 /*** 3. ISO2022 handlers ***/
456 /* The following note describes the coding system ISO2022 briefly.
457 Since the intention of this note is to help in understanding of
458 the programs in this file, some parts are NOT ACCURATE or OVERLY
459 SIMPLIFIED. For the thorough understanding, please refer to the
460 original document of ISO2022.
462 ISO2022 provides many mechanisms to encode several character sets
463 in 7-bit and 8-bit environment. If one chooses 7-bite environment,
464 all text is encoded by codes of less than 128. This may make the
465 encoded text a little bit longer, but the text gets more stability
466 to pass through several gateways (some of them strip off the MSB).
468 There are two kinds of character set: control character set and
469 graphic character set. The former contains control characters such
470 as `newline' and `escape' to provide control functions (control
471 functions are provided also by escape sequences). The latter
472 contains graphic characters such as ' A' and '-'. Emacs recognizes
473 two control character sets and many graphic character sets.
475 Graphic character sets are classified into one of the following
476 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
477 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
478 bytes (DIMENSION) and the number of characters in one dimension
479 (CHARS) of the set. In addition, each character set is assigned an
480 identification tag (called "final character" and denoted as <F>
481 here after) which is unique in each class. <F> of each character
482 set is decided by ECMA(*) when it is registered in ISO. Code range
483 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
485 Note (*): ECMA = European Computer Manufacturers Association
487 Here are examples of graphic character set [NAME(<F>)]:
488 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
489 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
490 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
491 o DIMENSION2_CHARS96 -- none for the moment
493 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
494 C0 [0x00..0x1F] -- control character plane 0
495 GL [0x20..0x7F] -- graphic character plane 0
496 C1 [0x80..0x9F] -- control character plane 1
497 GR [0xA0..0xFF] -- graphic character plane 1
499 A control character set is directly designated and invoked to C0 or
500 C1 by an escape sequence. The most common case is that ISO646's
501 control character set is designated/invoked to C0 and ISO6429's
502 control character set is designated/invoked to C1, and usually
503 these designations/invocations are omitted in a coded text. With
504 7-bit environment, only C0 can be used, and a control character for
505 C1 is encoded by an appropriate escape sequence to fit in the
506 environment. All control characters for C1 are defined the
507 corresponding escape sequences.
509 A graphic character set is at first designated to one of four
510 graphic registers (G0 through G3), then these graphic registers are
511 invoked to GL or GR. These designations and invocations can be
512 done independently. The most common case is that G0 is invoked to
513 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
514 these invocations and designations are omitted in a coded text.
515 With 7-bit environment, only GL can be used.
517 When a graphic character set of CHARS94 is invoked to GL, code 0x20
518 and 0x7F of GL area work as control characters SPACE and DEL
519 respectively, and code 0xA0 and 0xFF of GR area should not be used.
521 There are two ways of invocation: locking-shift and single-shift.
522 With locking-shift, the invocation lasts until the next different
523 invocation, whereas with single-shift, the invocation works only
524 for the following character and doesn't affect locking-shift.
525 Invocations are done by the following control characters or escape
528 ----------------------------------------------------------------------
529 function control char escape sequence description
530 ----------------------------------------------------------------------
531 SI (shift-in) 0x0F none invoke G0 to GL
532 SO (shift-out) 0x0E none invoke G1 to GL
533 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
534 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
535 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
536 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
537 ----------------------------------------------------------------------
538 The first four are for locking-shift. Control characters for these
539 functions are defined by macros ISO_CODE_XXX in `coding.h'.
541 Designations are done by the following escape sequences.
542 ----------------------------------------------------------------------
543 escape sequence description
544 ----------------------------------------------------------------------
545 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
546 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
547 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
548 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
549 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
550 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
551 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
552 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
553 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
554 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
555 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
556 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
557 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
558 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
559 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
560 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
561 ----------------------------------------------------------------------
563 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
564 of dimension 1, chars 94, and final character <F>, and etc.
566 Note (*): Although these designations are not allowed in ISO2022,
567 Emacs accepts them on decoding, and produces them on encoding
568 CHARS96 character set in a coding system which is characterized as
569 7-bit environment, non-locking-shift, and non-single-shift.
571 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
572 '(' can be omitted. We call this as "short-form" here after.
574 Now you may notice that there are a lot of ways for encoding the
575 same multilingual text in ISO2022. Actually, there exists many
576 coding systems such as Compound Text (used in X's inter client
577 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
578 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
579 localized platforms), and all of these are variants of ISO2022.
581 In addition to the above, Emacs handles two more kinds of escape
582 sequences: ISO6429's direction specification and Emacs' private
583 sequence for specifying character composition.
585 ISO6429's direction specification takes the following format:
586 o CSI ']' -- end of the current direction
587 o CSI '0' ']' -- end of the current direction
588 o CSI '1' ']' -- start of left-to-right text
589 o CSI '2' ']' -- start of right-to-left text
590 The control character CSI (0x9B: control sequence introducer) is
591 abbreviated to the escape sequence ESC '[' in 7-bit environment.
593 Character composition specification takes the following format:
594 o ESC '0' -- start character composition
595 o ESC '1' -- end character composition
596 Since these are not standard escape sequences of any ISO, the use
597 of them for these meaning is restricted to Emacs only. */
599 enum iso_code_class_type iso_code_class
[256];
601 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
602 Check if a text is encoded in ISO2022. If it is, returns an
603 integer in which appropriate flag bits any of:
604 CODING_CATEGORY_MASK_ISO_7
605 CODING_CATEGORY_MASK_ISO_8_1
606 CODING_CATEGORY_MASK_ISO_8_2
607 CODING_CATEGORY_MASK_ISO_7_ELSE
608 CODING_CATEGORY_MASK_ISO_8_ELSE
609 are set. If a code which should never appear in ISO2022 is found,
613 detect_coding_iso2022 (src
, src_end
)
614 unsigned char *src
, *src_end
;
616 int mask
= (CODING_CATEGORY_MASK_ISO_7
617 | CODING_CATEGORY_MASK_ISO_8_1
618 | CODING_CATEGORY_MASK_ISO_8_2
619 | CODING_CATEGORY_MASK_ISO_7_ELSE
620 | CODING_CATEGORY_MASK_ISO_8_ELSE
622 int g1
= 0; /* 1 iff designating to G1. */
625 while (src
< src_end
)
634 if ((c
>= '(' && c
<= '/'))
636 /* Designation sequence for a charset of dimension 1. */
640 if (c
< ' ' || c
>= 0x80)
641 /* Invalid designation sequence. */
646 /* Designation sequence for a charset of dimension 2. */
650 if (c
>= '@' && c
<= 'B')
651 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
653 else if (c
>= '(' && c
<= '/')
658 if (c
< ' ' || c
>= 0x80)
659 /* Invalid designation sequence. */
663 /* Invalid designation sequence. */
666 else if (c
== 'N' || c
== 'O' || c
== 'n' || c
== 'o')
668 mask
&= (CODING_CATEGORY_MASK_ISO_7_ELSE
669 | CODING_CATEGORY_MASK_ISO_8_ELSE
);
670 else if (c
== '0' || c
== '1' || c
== '2')
671 /* Start/end composition. */
674 /* Invalid escape sequence. */
679 mask
&= (CODING_CATEGORY_MASK_ISO_7_ELSE
680 | CODING_CATEGORY_MASK_ISO_8_ELSE
);
686 return CODING_CATEGORY_MASK_ISO_8_ELSE
;
693 if (VECTORP (Vmicrosoft_code_table
)
694 && !NILP (XVECTOR (Vmicrosoft_code_table
)->contents
[c
]))
696 mask
&= ~(CODING_CATEGORY_MASK_ISO_7
697 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
704 unsigned char *src_begin
= src
;
706 mask
&= ~(CODING_CATEGORY_MASK_ISO_7
707 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
708 while (src
< src_end
&& *src
>= 0xA0)
710 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
711 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
720 /* Decode a character of which charset is CHARSET and the 1st position
721 code is C1. If dimension of CHARSET is 2, the 2nd position code is
722 fetched from SRC and set to C2. If CHARSET is negative, it means
723 that we are decoding ill formed text, and what we can do is just to
726 #define DECODE_ISO_CHARACTER(charset, c1) \
728 int c_alt, charset_alt = (charset); \
729 if (COMPOSING_HEAD_P (coding->composing)) \
731 *dst++ = LEADING_CODE_COMPOSITION; \
732 if (COMPOSING_WITH_RULE_P (coding->composing)) \
733 /* To tell composition rules are embeded. */ \
735 coding->composing += 2; \
737 if ((charset) >= 0) \
739 if (CHARSET_DIMENSION (charset) == 2) \
740 ONE_MORE_BYTE (c2); \
741 if (!NILP (unification_table) \
742 && ((c_alt = unify_char (unification_table, \
743 -1, (charset), c1, c2)) >= 0)) \
744 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
746 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
747 DECODE_CHARACTER_ASCII (c1); \
748 else if (CHARSET_DIMENSION (charset_alt) == 1) \
749 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
751 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
752 if (COMPOSING_WITH_RULE_P (coding->composing)) \
753 /* To tell a composition rule follows. */ \
754 coding->composing = COMPOSING_WITH_RULE_RULE; \
757 /* Set designation state into CODING. */
758 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
760 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
761 make_number (chars), \
762 make_number (final_char)); \
765 if (coding->direction == 1 \
766 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
767 charset = CHARSET_REVERSE_CHARSET (charset); \
768 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
772 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
775 decode_coding_iso2022 (coding
, source
, destination
,
776 src_bytes
, dst_bytes
, consumed
)
777 struct coding_system
*coding
;
778 unsigned char *source
, *destination
;
779 int src_bytes
, dst_bytes
;
782 unsigned char *src
= source
;
783 unsigned char *src_end
= source
+ src_bytes
;
784 unsigned char *dst
= destination
;
785 unsigned char *dst_end
= destination
+ dst_bytes
;
786 /* Since the maximum bytes produced by each loop is 7, we subtract 6
787 from DST_END to assure that overflow checking is necessary only
788 at the head of loop. */
789 unsigned char *adjusted_dst_end
= dst_end
- 6;
791 /* Charsets invoked to graphic plane 0 and 1 respectively. */
792 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
793 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
794 Lisp_Object unification_table
795 = coding
->character_unification_table_for_decode
;
797 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
798 unification_table
= Vstandard_character_unification_table_for_decode
;
800 while (src
< src_end
&& dst
< adjusted_dst_end
)
802 /* SRC_BASE remembers the start position in source in each loop.
803 The loop will be exited when there's not enough source text
804 to analyze long escape sequence or 2-byte code (within macros
805 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
806 to SRC_BASE before exiting. */
807 unsigned char *src_base
= src
;
810 switch (iso_code_class
[c1
])
812 case ISO_0x20_or_0x7F
:
813 if (!coding
->composing
814 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
816 /* This is SPACE or DEL. */
820 /* This is a graphic character, we fall down ... */
822 case ISO_graphic_plane_0
:
823 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
825 /* This is a composition rule. */
827 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
830 DECODE_ISO_CHARACTER (charset0
, c1
);
833 case ISO_0xA0_or_0xFF
:
834 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94)
840 /* This is a graphic character, we fall down ... */
842 case ISO_graphic_plane_1
:
843 DECODE_ISO_CHARACTER (charset1
, c1
);
846 case ISO_control_code
:
847 /* All ISO2022 control characters in this class have the
848 same representation in Emacs internal format. */
852 case ISO_carriage_return
:
853 if (coding
->eol_type
== CODING_EOL_CR
)
857 else if (coding
->eol_type
== CODING_EOL_CRLF
)
860 if (c1
== ISO_CODE_LF
)
875 if (CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
876 goto label_invalid_escape_sequence
;
877 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
878 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
882 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
883 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
886 case ISO_single_shift_2_7
:
887 case ISO_single_shift_2
:
888 /* SS2 is handled as an escape sequence of ESC 'N' */
890 goto label_escape_sequence
;
892 case ISO_single_shift_3
:
893 /* SS2 is handled as an escape sequence of ESC 'O' */
895 goto label_escape_sequence
;
897 case ISO_control_sequence_introducer
:
898 /* CSI is handled as an escape sequence of ESC '[' ... */
900 goto label_escape_sequence
;
904 label_escape_sequence
:
905 /* Escape sequences handled by Emacs are invocation,
906 designation, direction specification, and character
907 composition specification. */
910 case '&': /* revision of following character set */
912 if (!(c1
>= '@' && c1
<= '~'))
913 goto label_invalid_escape_sequence
;
915 if (c1
!= ISO_CODE_ESC
)
916 goto label_invalid_escape_sequence
;
918 goto label_escape_sequence
;
920 case '$': /* designation of 2-byte character set */
922 if (c1
>= '@' && c1
<= 'B')
923 { /* designation of JISX0208.1978, GB2312.1980,
925 DECODE_DESIGNATION (0, 2, 94, c1
);
927 else if (c1
>= 0x28 && c1
<= 0x2B)
928 { /* designation of DIMENSION2_CHARS94 character set */
930 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
932 else if (c1
>= 0x2C && c1
<= 0x2F)
933 { /* designation of DIMENSION2_CHARS96 character set */
935 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
938 goto label_invalid_escape_sequence
;
941 case 'n': /* invocation of locking-shift-2 */
942 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
943 goto label_invalid_escape_sequence
;
944 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
945 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
948 case 'o': /* invocation of locking-shift-3 */
949 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
950 goto label_invalid_escape_sequence
;
951 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
952 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
955 case 'N': /* invocation of single-shift-2 */
956 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
957 goto label_invalid_escape_sequence
;
959 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
960 DECODE_ISO_CHARACTER (charset
, c1
);
963 case 'O': /* invocation of single-shift-3 */
964 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
965 goto label_invalid_escape_sequence
;
967 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
968 DECODE_ISO_CHARACTER (charset
, c1
);
971 case '0': /* start composing without embeded rules */
972 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
975 case '1': /* end composing */
976 coding
->composing
= COMPOSING_NO
;
979 case '2': /* start composing with embeded rules */
980 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
983 case '[': /* specification of direction */
984 /* For the moment, nested direction is not supported.
985 So, the value of `coding->direction' is 0 or 1: 0
986 means left-to-right, 1 means right-to-left. */
990 case ']': /* end of the current direction */
991 coding
->direction
= 0;
993 case '0': /* end of the current direction */
994 case '1': /* start of left-to-right direction */
997 coding
->direction
= 0;
999 goto label_invalid_escape_sequence
;
1002 case '2': /* start of right-to-left direction */
1005 coding
->direction
= 1;
1007 goto label_invalid_escape_sequence
;
1011 goto label_invalid_escape_sequence
;
1016 if (c1
>= 0x28 && c1
<= 0x2B)
1017 { /* designation of DIMENSION1_CHARS94 character set */
1019 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1021 else if (c1
>= 0x2C && c1
<= 0x2F)
1022 { /* designation of DIMENSION1_CHARS96 character set */
1024 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1028 goto label_invalid_escape_sequence
;
1031 /* We must update these variables now. */
1032 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1033 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1036 label_invalid_escape_sequence
:
1038 int length
= src
- src_base
;
1040 bcopy (src_base
, dst
, length
);
1047 coding
->carryover_size
= src
- src_base
;
1048 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1053 /* If this is the last block of the text to be decoded, we had
1054 better just flush out all remaining codes in the text although
1055 they are not valid characters. */
1056 if (coding
->last_block
)
1058 bcopy (src
, dst
, src_end
- src
);
1059 dst
+= (src_end
- src
);
1062 *consumed
= src
- source
;
1063 return dst
- destination
;
1066 /* ISO2022 encoding stuff. */
1069 It is not enough to say just "ISO2022" on encoding, we have to
1070 specify more details. In Emacs, each coding-system of ISO2022
1071 variant has the following specifications:
1072 1. Initial designation to G0 thru G3.
1073 2. Allows short-form designation?
1074 3. ASCII should be designated to G0 before control characters?
1075 4. ASCII should be designated to G0 at end of line?
1076 5. 7-bit environment or 8-bit environment?
1077 6. Use locking-shift?
1078 7. Use Single-shift?
1079 And the following two are only for Japanese:
1080 8. Use ASCII in place of JIS0201-1976-Roman?
1081 9. Use JISX0208-1983 in place of JISX0208-1978?
1082 These specifications are encoded in `coding->flags' as flag bits
1083 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1087 /* Produce codes (escape sequence) for designating CHARSET to graphic
1088 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1089 the coding system CODING allows, produce designation sequence of
1092 #define ENCODE_DESIGNATION(charset, reg, coding) \
1094 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1095 char *intermediate_char_94 = "()*+"; \
1096 char *intermediate_char_96 = ",-./"; \
1098 = Fassq (make_number (charset), Vcharset_revision_alist); \
1099 if (! NILP (temp)) \
1101 *dst++ = ISO_CODE_ESC; \
1103 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1105 *dst++ = ISO_CODE_ESC; \
1106 if (CHARSET_DIMENSION (charset) == 1) \
1108 if (CHARSET_CHARS (charset) == 94) \
1109 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1111 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1116 if (CHARSET_CHARS (charset) == 94) \
1118 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1120 || final_char < '@' || final_char > 'B') \
1121 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1124 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1126 *dst++ = final_char; \
1127 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1130 /* The following two macros produce codes (control character or escape
1131 sequence) for ISO2022 single-shift functions (single-shift-2 and
1134 #define ENCODE_SINGLE_SHIFT_2 \
1136 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1137 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1139 *dst++ = ISO_CODE_SS2; \
1140 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1143 #define ENCODE_SINGLE_SHIFT_3 \
1145 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1146 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1148 *dst++ = ISO_CODE_SS3; \
1149 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1152 /* The following four macros produce codes (control character or
1153 escape sequence) for ISO2022 locking-shift functions (shift-in,
1154 shift-out, locking-shift-2, and locking-shift-3). */
1156 #define ENCODE_SHIFT_IN \
1158 *dst++ = ISO_CODE_SI; \
1159 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1162 #define ENCODE_SHIFT_OUT \
1164 *dst++ = ISO_CODE_SO; \
1165 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1168 #define ENCODE_LOCKING_SHIFT_2 \
1170 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1171 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1174 #define ENCODE_LOCKING_SHIFT_3 \
1176 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1177 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1180 /* Produce codes for a DIMENSION1 character whose character set is
1181 CHARSET and whose position-code is C1. Designation and invocation
1182 sequences are also produced in advance if necessary. */
1185 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1187 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1189 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1190 *dst++ = c1 & 0x7F; \
1192 *dst++ = c1 | 0x80; \
1193 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1196 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1198 *dst++ = c1 & 0x7F; \
1201 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1203 *dst++ = c1 | 0x80; \
1206 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1207 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1209 /* We should not encode this character, instead produce one or \
1211 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1212 if (CHARSET_WIDTH (charset) == 2) \
1213 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1217 /* Since CHARSET is not yet invoked to any graphic planes, we \
1218 must invoke it, or, at first, designate it to some graphic \
1219 register. Then repeat the loop to actually produce the \
1221 dst = encode_invocation_designation (charset, coding, dst); \
1224 /* Produce codes for a DIMENSION2 character whose character set is
1225 CHARSET and whose position-codes are C1 and C2. Designation and
1226 invocation codes are also produced in advance if necessary. */
1228 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1230 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1232 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1233 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1235 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1236 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1239 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1241 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1244 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1246 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1249 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1250 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1252 /* We should not encode this character, instead produce one or \
1254 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1255 if (CHARSET_WIDTH (charset) == 2) \
1256 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1260 /* Since CHARSET is not yet invoked to any graphic planes, we \
1261 must invoke it, or, at first, designate it to some graphic \
1262 register. Then repeat the loop to actually produce the \
1264 dst = encode_invocation_designation (charset, coding, dst); \
1267 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1269 int c_alt, charset_alt; \
1270 if (!NILP (unification_table) \
1271 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1273 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1275 charset_alt = charset; \
1276 if (CHARSET_DIMENSION (charset_alt) == 1) \
1277 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1279 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1282 /* Produce designation and invocation codes at a place pointed by DST
1283 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1287 encode_invocation_designation (charset
, coding
, dst
)
1289 struct coding_system
*coding
;
1292 int reg
; /* graphic register number */
1294 /* At first, check designations. */
1295 for (reg
= 0; reg
< 4; reg
++)
1296 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1301 /* CHARSET is not yet designated to any graphic registers. */
1302 /* At first check the requested designation. */
1303 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1304 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1305 /* Since CHARSET requests no special designation, designate it
1306 to graphic register 0. */
1309 ENCODE_DESIGNATION (charset
, reg
, coding
);
1312 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1313 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1315 /* Since the graphic register REG is not invoked to any graphic
1316 planes, invoke it to graphic plane 0. */
1319 case 0: /* graphic register 0 */
1323 case 1: /* graphic register 1 */
1327 case 2: /* graphic register 2 */
1328 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1329 ENCODE_SINGLE_SHIFT_2
;
1331 ENCODE_LOCKING_SHIFT_2
;
1334 case 3: /* graphic register 3 */
1335 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1336 ENCODE_SINGLE_SHIFT_3
;
1338 ENCODE_LOCKING_SHIFT_3
;
1345 /* The following two macros produce codes for indicating composition. */
1346 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1347 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1348 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1350 /* The following three macros produce codes for indicating direction
1352 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1354 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1355 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1357 *dst++ = ISO_CODE_CSI; \
1360 #define ENCODE_DIRECTION_R2L \
1361 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1363 #define ENCODE_DIRECTION_L2R \
1364 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1366 /* Produce codes for designation and invocation to reset the graphic
1367 planes and registers to initial state. */
1368 #define ENCODE_RESET_PLANE_AND_REGISTER \
1371 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1373 for (reg = 0; reg < 4; reg++) \
1374 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1375 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1376 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1377 ENCODE_DESIGNATION \
1378 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1381 /* Produce designation sequences of charsets in the line started from
1382 *SRC to a place pointed by DSTP.
1384 If the current block ends before any end-of-line, we may fail to
1385 find all the necessary *designations. */
1386 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1387 struct coding_system
*coding
;
1389 unsigned char *src
, *src_end
, **dstp
;
1391 int charset
, c
, found
= 0, reg
;
1392 /* Table of charsets to be designated to each graphic register. */
1394 unsigned char *dst
= *dstp
;
1396 for (reg
= 0; reg
< 4; reg
++)
1399 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1401 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1404 charset
= CHARSET_AT (src
);
1409 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1410 if ((c_alt
= unify_char (table
, -1, charset
, c1
, c2
)) >= 0)
1411 charset
= CHAR_CHARSET (c_alt
);
1414 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1415 if (r
[reg
] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1426 for (reg
= 0; reg
< 4; reg
++)
1428 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1429 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1434 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1437 encode_coding_iso2022 (coding
, source
, destination
,
1438 src_bytes
, dst_bytes
, consumed
)
1439 struct coding_system
*coding
;
1440 unsigned char *source
, *destination
;
1441 int src_bytes
, dst_bytes
;
1444 unsigned char *src
= source
;
1445 unsigned char *src_end
= source
+ src_bytes
;
1446 unsigned char *dst
= destination
;
1447 unsigned char *dst_end
= destination
+ dst_bytes
;
1448 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1449 from DST_END to assure overflow checking is necessary only at the
1451 unsigned char *adjusted_dst_end
= dst_end
- 19;
1452 Lisp_Object unification_table
1453 = coding
->character_unification_table_for_encode
;
1455 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1456 unification_table
= Vstandard_character_unification_table_for_encode
;
1458 while (src
< src_end
&& dst
< adjusted_dst_end
)
1460 /* SRC_BASE remembers the start position in source in each loop.
1461 The loop will be exited when there's not enough source text
1462 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1463 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1464 reset to SRC_BASE before exiting. */
1465 unsigned char *src_base
= src
;
1466 int charset
, c1
, c2
, c3
, c4
;
1468 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1469 && CODING_SPEC_ISO_BOL (coding
))
1471 /* We have to produce designation sequences if any now. */
1472 encode_designation_at_bol (coding
, unification_table
,
1473 src
, src_end
, &dst
);
1474 CODING_SPEC_ISO_BOL (coding
) = 0;
1478 /* If we are seeing a component of a composite character, we are
1479 seeing a leading-code specially encoded for composition, or a
1480 composition rule if composing with rule. We must set C1
1481 to a normal leading-code or an ASCII code. If we are not at
1482 a composed character, we must reset the composition state. */
1483 if (COMPOSING_P (coding
->composing
))
1487 /* We are not in a composite character any longer. */
1488 coding
->composing
= COMPOSING_NO
;
1489 ENCODE_COMPOSITION_END
;
1493 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1496 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1499 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1500 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1503 /* This is an ASCII component. */
1508 /* This is a leading-code of non ASCII component. */
1513 /* Now encode one character. C1 is a control character, an
1514 ASCII character, or a leading-code of multi-byte character. */
1515 switch (emacs_code_class
[c1
])
1517 case EMACS_ascii_code
:
1518 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1521 case EMACS_control_code
:
1522 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1523 ENCODE_RESET_PLANE_AND_REGISTER
;
1527 case EMACS_carriage_return_code
:
1528 if (!coding
->selective
)
1530 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1531 ENCODE_RESET_PLANE_AND_REGISTER
;
1535 /* fall down to treat '\r' as '\n' ... */
1537 case EMACS_linefeed_code
:
1538 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1539 ENCODE_RESET_PLANE_AND_REGISTER
;
1540 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1541 bcopy (coding
->spec
.iso2022
.initial_designation
,
1542 coding
->spec
.iso2022
.current_designation
,
1543 sizeof coding
->spec
.iso2022
.initial_designation
);
1544 if (coding
->eol_type
== CODING_EOL_LF
1545 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1546 *dst
++ = ISO_CODE_LF
;
1547 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1548 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1550 *dst
++ = ISO_CODE_CR
;
1551 CODING_SPEC_ISO_BOL (coding
) = 1;
1554 case EMACS_leading_code_2
:
1558 /* invalid sequence */
1563 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1566 case EMACS_leading_code_3
:
1567 TWO_MORE_BYTES (c2
, c3
);
1568 if (c2
< 0xA0 || c3
< 0xA0)
1570 /* invalid sequence */
1575 else if (c1
< LEADING_CODE_PRIVATE_11
)
1576 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1578 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1581 case EMACS_leading_code_4
:
1582 THREE_MORE_BYTES (c2
, c3
, c4
);
1583 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1585 /* invalid sequence */
1592 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1595 case EMACS_leading_code_composition
:
1599 /* invalid sequence */
1603 else if (c2
== 0xFF)
1605 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1606 ENCODE_COMPOSITION_WITH_RULE_START
;
1610 /* Rewind one byte because it is a character code of
1611 composition elements. */
1613 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1614 ENCODE_COMPOSITION_NO_RULE_START
;
1618 case EMACS_invalid_code
:
1624 /* We reach here because the source date ends not at character
1626 coding
->carryover_size
= src_end
- src_base
;
1627 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1632 /* If this is the last block of the text to be encoded, we must
1633 reset graphic planes and registers to the initial state. */
1634 if (src
>= src_end
&& coding
->last_block
)
1636 ENCODE_RESET_PLANE_AND_REGISTER
;
1637 if (coding
->carryover_size
> 0
1638 && coding
->carryover_size
< (dst_end
- dst
))
1640 bcopy (coding
->carryover
, dst
, coding
->carryover_size
);
1641 dst
+= coding
->carryover_size
;
1642 coding
->carryover_size
= 0;
1645 *consumed
= src
- source
;
1646 return dst
- destination
;
1650 /*** 4. SJIS and BIG5 handlers ***/
1652 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1653 quite widely. So, for the moment, Emacs supports them in the bare
1654 C code. But, in the future, they may be supported only by CCL. */
1656 /* SJIS is a coding system encoding three character sets: ASCII, right
1657 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1658 as is. A character of charset katakana-jisx0201 is encoded by
1659 "position-code + 0x80". A character of charset japanese-jisx0208
1660 is encoded in 2-byte but two position-codes are divided and shifted
1661 so that it fit in the range below.
1663 --- CODE RANGE of SJIS ---
1664 (character set) (range)
1666 KATAKANA-JISX0201 0xA0 .. 0xDF
1667 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1668 (2nd byte) 0x40 .. 0xFF
1669 -------------------------------
1673 /* BIG5 is a coding system encoding two character sets: ASCII and
1674 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1675 character set and is encoded in two-byte.
1677 --- CODE RANGE of BIG5 ---
1678 (character set) (range)
1680 Big5 (1st byte) 0xA1 .. 0xFE
1681 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1682 --------------------------
1684 Since the number of characters in Big5 is larger than maximum
1685 characters in Emacs' charset (96x96), it can't be handled as one
1686 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1687 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1688 contains frequently used characters and the latter contains less
1689 frequently used characters. */
1691 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1692 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1693 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1694 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1696 /* Number of Big5 characters which have the same code in 1st byte. */
1697 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1699 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1702 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1704 charset = charset_big5_1; \
1707 charset = charset_big5_2; \
1708 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1710 c1 = temp / (0xFF - 0xA1) + 0x21; \
1711 c2 = temp % (0xFF - 0xA1) + 0x21; \
1714 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1716 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1717 if (charset == charset_big5_2) \
1718 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1719 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1720 b2 = temp % BIG5_SAME_ROW; \
1721 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1724 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1726 int c_alt, charset_alt = (charset); \
1727 if (!NILP (unification_table) \
1728 && ((c_alt = unify_char (unification_table, \
1729 -1, (charset), c1, c2)) >= 0)) \
1730 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1731 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1732 DECODE_CHARACTER_ASCII (c1); \
1733 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1734 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1736 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1739 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1741 int c_alt, charset_alt; \
1742 if (!NILP (unification_table) \
1743 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1745 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1747 charset_alt = charset; \
1748 if (charset_alt == charset_ascii) \
1750 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1752 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1755 *dst++ = charset_alt, *dst++ = c1; \
1759 c1 &= 0x7F, c2 &= 0x7F; \
1760 if (sjis_p && charset_alt == charset_jisx0208) \
1762 unsigned char s1, s2; \
1764 ENCODE_SJIS (c1, c2, s1, s2); \
1765 *dst++ = s1, *dst++ = s2; \
1768 && (charset_alt == charset_big5_1 \
1769 || charset_alt == charset_big5_2)) \
1771 unsigned char b1, b2; \
1773 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
1774 *dst++ = b1, *dst++ = b2; \
1777 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1781 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1782 Check if a text is encoded in SJIS. If it is, return
1783 CODING_CATEGORY_MASK_SJIS, else return 0. */
1786 detect_coding_sjis (src
, src_end
)
1787 unsigned char *src
, *src_end
;
1791 while (src
< src_end
)
1794 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1796 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
1798 if (src
< src_end
&& *src
++ < 0x40)
1802 return CODING_CATEGORY_MASK_SJIS
;
1805 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1806 Check if a text is encoded in BIG5. If it is, return
1807 CODING_CATEGORY_MASK_BIG5, else return 0. */
1810 detect_coding_big5 (src
, src_end
)
1811 unsigned char *src
, *src_end
;
1815 while (src
< src_end
)
1818 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1825 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
1829 return CODING_CATEGORY_MASK_BIG5
;
1832 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1833 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1836 decode_coding_sjis_big5 (coding
, source
, destination
,
1837 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1838 struct coding_system
*coding
;
1839 unsigned char *source
, *destination
;
1840 int src_bytes
, dst_bytes
;
1844 unsigned char *src
= source
;
1845 unsigned char *src_end
= source
+ src_bytes
;
1846 unsigned char *dst
= destination
;
1847 unsigned char *dst_end
= destination
+ dst_bytes
;
1848 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1849 from DST_END to assure overflow checking is necessary only at the
1851 unsigned char *adjusted_dst_end
= dst_end
- 3;
1852 Lisp_Object unification_table
1853 = coding
->character_unification_table_for_decode
;
1855 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1856 unification_table
= Vstandard_character_unification_table_for_decode
;
1858 while (src
< src_end
&& dst
< adjusted_dst_end
)
1860 /* SRC_BASE remembers the start position in source in each loop.
1861 The loop will be exited when there's not enough source text
1862 to analyze two-byte character (within macro ONE_MORE_BYTE).
1863 In that case, SRC is reset to SRC_BASE before exiting. */
1864 unsigned char *src_base
= src
;
1865 unsigned char c1
= *src
++, c2
, c3
, c4
;
1869 if (coding
->eol_type
== CODING_EOL_CRLF
)
1875 /* To process C2 again, SRC is subtracted by 1. */
1884 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
1885 else if (c1
< 0xA0 || c1
>= 0xE0)
1887 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1891 DECODE_SJIS (c1
, c2
, c3
, c4
);
1892 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
1894 else if (c1
>= 0xE0 && c1
< 0xFF)
1899 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1900 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
1902 else /* Invalid code */
1907 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1909 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
, /* dummy */ c2
);
1915 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1916 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
1922 coding
->carryover_size
= src
- src_base
;
1923 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1928 *consumed
= src
- source
;
1929 return dst
- destination
;
1932 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1933 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1934 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1935 sure that all these charsets are registered as official charset
1936 (i.e. do not have extended leading-codes). Characters of other
1937 charsets are produced without any encoding. If SJIS_P is 1, encode
1938 SJIS text, else encode BIG5 text. */
1941 encode_coding_sjis_big5 (coding
, source
, destination
,
1942 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1943 struct coding_system
*coding
;
1944 unsigned char *source
, *destination
;
1945 int src_bytes
, dst_bytes
;
1949 unsigned char *src
= source
;
1950 unsigned char *src_end
= source
+ src_bytes
;
1951 unsigned char *dst
= destination
;
1952 unsigned char *dst_end
= destination
+ dst_bytes
;
1953 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1954 from DST_END to assure overflow checking is necessary only at the
1956 unsigned char *adjusted_dst_end
= dst_end
- 1;
1957 Lisp_Object unification_table
1958 = coding
->character_unification_table_for_encode
;
1960 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1961 unification_table
= Vstandard_character_unification_table_for_encode
;
1963 while (src
< src_end
&& dst
< adjusted_dst_end
)
1965 /* SRC_BASE remembers the start position in source in each loop.
1966 The loop will be exited when there's not enough source text
1967 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1968 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1970 unsigned char *src_base
= src
;
1971 unsigned char c1
= *src
++, c2
, c3
, c4
;
1973 if (coding
->composing
)
1980 else if (c1
>= 0xA0)
1983 coding
->composing
= 0;
1986 switch (emacs_code_class
[c1
])
1988 case EMACS_ascii_code
:
1989 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
1992 case EMACS_control_code
:
1996 case EMACS_carriage_return_code
:
1997 if (!coding
->selective
)
2002 /* fall down to treat '\r' as '\n' ... */
2004 case EMACS_linefeed_code
:
2005 if (coding
->eol_type
== CODING_EOL_LF
2006 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2008 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2009 *dst
++ = '\r', *dst
++ = '\n';
2014 case EMACS_leading_code_2
:
2016 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
2019 case EMACS_leading_code_3
:
2020 TWO_MORE_BYTES (c2
, c3
);
2021 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
2024 case EMACS_leading_code_4
:
2025 THREE_MORE_BYTES (c2
, c3
, c4
);
2026 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
2029 case EMACS_leading_code_composition
:
2030 coding
->composing
= 1;
2033 default: /* i.e. case EMACS_invalid_code: */
2039 coding
->carryover_size
= src_end
- src_base
;
2040 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
2045 *consumed
= src
- source
;
2046 return dst
- destination
;
2050 /*** 5. End-of-line handlers ***/
2052 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2053 This function is called only when `coding->eol_type' is
2054 CODING_EOL_CRLF or CODING_EOL_CR. */
2056 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2057 struct coding_system
*coding
;
2058 unsigned char *source
, *destination
;
2059 int src_bytes
, dst_bytes
;
2062 unsigned char *src
= source
;
2063 unsigned char *src_end
= source
+ src_bytes
;
2064 unsigned char *dst
= destination
;
2065 unsigned char *dst_end
= destination
+ dst_bytes
;
2068 switch (coding
->eol_type
)
2070 case CODING_EOL_CRLF
:
2072 /* Since the maximum bytes produced by each loop is 2, we
2073 subtract 1 from DST_END to assure overflow checking is
2074 necessary only at the head of loop. */
2075 unsigned char *adjusted_dst_end
= dst_end
- 1;
2077 while (src
< src_end
&& dst
< adjusted_dst_end
)
2079 unsigned char *src_base
= src
;
2080 unsigned char c
= *src
++;
2093 coding
->carryover_size
= src
- src_base
;
2094 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
2098 *consumed
= src
- source
;
2099 produced
= dst
- destination
;
2104 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2105 bcopy (source
, destination
, produced
);
2106 dst_end
= destination
+ produced
;
2107 while (dst
< dst_end
)
2108 if (*dst
++ == '\r') dst
[-1] = '\n';
2109 *consumed
= produced
;
2112 default: /* i.e. case: CODING_EOL_LF */
2113 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2114 bcopy (source
, destination
, produced
);
2115 *consumed
= produced
;
2122 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2123 format of end-of-line according to `coding->eol_type'. If
2124 `coding->selective' is 1, code '\r' in source text also means
2127 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2128 struct coding_system
*coding
;
2129 unsigned char *source
, *destination
;
2130 int src_bytes
, dst_bytes
;
2133 unsigned char *src
= source
;
2134 unsigned char *dst
= destination
;
2140 switch (coding
->eol_type
)
2143 case CODING_EOL_UNDECIDED
:
2144 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2145 bcopy (source
, destination
, produced
);
2146 if (coding
->selective
)
2150 if (*dst
++ == '\r') dst
[-1] = '\n';
2152 *consumed
= produced
;
2154 case CODING_EOL_CRLF
:
2157 unsigned char *src_end
= source
+ src_bytes
;
2158 unsigned char *dst_end
= destination
+ dst_bytes
;
2159 /* Since the maximum bytes produced by each loop is 2, we
2160 subtract 1 from DST_END to assure overflow checking is
2161 necessary only at the head of loop. */
2162 unsigned char *adjusted_dst_end
= dst_end
- 1;
2164 while (src
< src_end
&& dst
< adjusted_dst_end
)
2167 if (c
== '\n' || (c
== '\r' && coding
->selective
))
2168 *dst
++ = '\r', *dst
++ = '\n';
2172 produced
= dst
- destination
;
2173 *consumed
= src
- source
;
2177 default: /* i.e. case CODING_EOL_CR: */
2178 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2179 bcopy (source
, destination
, produced
);
2183 if (*dst
++ == '\n') dst
[-1] = '\r';
2185 *consumed
= produced
;
2192 /*** 6. C library functions ***/
2194 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2195 has a property `coding-system'. The value of this property is a
2196 vector of length 5 (called as coding-vector). Among elements of
2197 this vector, the first (element[0]) and the fifth (element[4])
2198 carry important information for decoding/encoding. Before
2199 decoding/encoding, this information should be set in fields of a
2200 structure of type `coding_system'.
2202 A value of property `coding-system' can be a symbol of another
2203 subsidiary coding-system. In that case, Emacs gets coding-vector
2206 `element[0]' contains information to be set in `coding->type'. The
2207 value and its meaning is as follows:
2209 0 -- coding_type_emacs_mule
2210 1 -- coding_type_sjis
2211 2 -- coding_type_iso2022
2212 3 -- coding_type_big5
2213 4 -- coding_type_ccl encoder/decoder written in CCL
2214 nil -- coding_type_no_conversion
2215 t -- coding_type_undecided (automatic conversion on decoding,
2216 no-conversion on encoding)
2218 `element[4]' contains information to be set in `coding->flags' and
2219 `coding->spec'. The meaning varies by `coding->type'.
2221 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2222 of length 32 (of which the first 13 sub-elements are used now).
2223 Meanings of these sub-elements are:
2225 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2226 If the value is an integer of valid charset, the charset is
2227 assumed to be designated to graphic register N initially.
2229 If the value is minus, it is a minus value of charset which
2230 reserves graphic register N, which means that the charset is
2231 not designated initially but should be designated to graphic
2232 register N just before encoding a character in that charset.
2234 If the value is nil, graphic register N is never used on
2237 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2238 Each value takes t or nil. See the section ISO2022 of
2239 `coding.h' for more information.
2241 If `coding->type' is `coding_type_big5', element[4] is t to denote
2242 BIG5-ETen or nil to denote BIG5-HKU.
2244 If `coding->type' takes the other value, element[4] is ignored.
2246 Emacs Lisp's coding system also carries information about format of
2247 end-of-line in a value of property `eol-type'. If the value is
2248 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2249 means CODING_EOL_CR. If it is not integer, it should be a vector
2250 of subsidiary coding systems of which property `eol-type' has one
2255 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2256 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2257 is setup so that no conversion is necessary and return -1, else
2261 setup_coding_system (coding_system
, coding
)
2262 Lisp_Object coding_system
;
2263 struct coding_system
*coding
;
2265 Lisp_Object type
, eol_type
;
2267 /* At first, set several fields to default values. */
2268 coding
->require_flushing
= 0;
2269 coding
->last_block
= 0;
2270 coding
->selective
= 0;
2271 coding
->composing
= 0;
2272 coding
->direction
= 0;
2273 coding
->carryover_size
= 0;
2274 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2275 coding
->character_unification_table_for_decode
= Qnil
;
2276 coding
->character_unification_table_for_encode
= Qnil
;
2278 Vlast_coding_system_used
= coding
->symbol
= coding_system
;
2280 /* Get value of property `coding-system' until we get a vector.
2281 While doing that, also get values of properties
2282 `post-read-conversion', `pre-write-conversion',
2283 `character-unification-table-for-decode',
2284 `character-unification-table-for-encode' and `eol-type'. */
2285 while (!NILP (coding_system
) && SYMBOLP (coding_system
))
2287 if (NILP (coding
->post_read_conversion
))
2288 coding
->post_read_conversion
= Fget (coding_system
,
2289 Qpost_read_conversion
);
2290 if (NILP (coding
->pre_write_conversion
))
2291 coding
->pre_write_conversion
= Fget (coding_system
,
2292 Qpre_write_conversion
);
2293 if (!inhibit_eol_conversion
&& NILP (eol_type
))
2294 eol_type
= Fget (coding_system
, Qeol_type
);
2296 if (NILP (coding
->character_unification_table_for_decode
))
2297 coding
->character_unification_table_for_decode
2298 = Fget (coding_system
, Qcharacter_unification_table_for_decode
);
2300 if (NILP (coding
->character_unification_table_for_encode
))
2301 coding
->character_unification_table_for_encode
2302 = Fget (coding_system
, Qcharacter_unification_table_for_encode
);
2304 coding_system
= Fget (coding_system
, Qcoding_system
);
2307 while (!NILP (coding
->character_unification_table_for_decode
)
2308 && SYMBOLP (coding
->character_unification_table_for_decode
))
2309 coding
->character_unification_table_for_decode
2310 = Fget (coding
->character_unification_table_for_decode
,
2311 Qcharacter_unification_table_for_decode
);
2312 if (!NILP (coding
->character_unification_table_for_decode
)
2313 && !CHAR_TABLE_P (coding
->character_unification_table_for_decode
))
2314 coding
->character_unification_table_for_decode
= Qnil
;
2316 while (!NILP (coding
->character_unification_table_for_encode
)
2317 && SYMBOLP (coding
->character_unification_table_for_encode
))
2318 coding
->character_unification_table_for_encode
2319 = Fget (coding
->character_unification_table_for_encode
,
2320 Qcharacter_unification_table_for_encode
);
2321 if (!NILP (coding
->character_unification_table_for_encode
)
2322 && !CHAR_TABLE_P (coding
->character_unification_table_for_encode
))
2323 coding
->character_unification_table_for_encode
= Qnil
;
2325 if (!VECTORP (coding_system
)
2326 || XVECTOR (coding_system
)->size
!= 5)
2327 goto label_invalid_coding_system
;
2329 if (VECTORP (eol_type
))
2330 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2331 else if (XFASTINT (eol_type
) == 1)
2332 coding
->eol_type
= CODING_EOL_CRLF
;
2333 else if (XFASTINT (eol_type
) == 2)
2334 coding
->eol_type
= CODING_EOL_CR
;
2336 coding
->eol_type
= CODING_EOL_LF
;
2338 type
= XVECTOR (coding_system
)->contents
[0];
2339 switch (XFASTINT (type
))
2342 coding
->type
= coding_type_emacs_mule
;
2346 coding
->type
= coding_type_sjis
;
2350 coding
->type
= coding_type_iso2022
;
2352 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2354 int i
, charset
, default_reg_bits
= 0;
2356 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2357 goto label_invalid_coding_system
;
2359 flags
= XVECTOR (val
)->contents
;
2361 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2362 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2363 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2364 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2365 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2366 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2367 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2368 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2369 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2370 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2371 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2372 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
2375 /* Invoke graphic register 0 to plane 0. */
2376 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2377 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2378 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2379 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2380 /* Not single shifting at first. */
2381 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
2382 /* Beginning of buffer should also be regarded as bol. */
2383 CODING_SPEC_ISO_BOL (coding
) = 1;
2385 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2386 FLAGS[REG] can be one of below:
2387 integer CHARSET: CHARSET occupies register I,
2388 t: designate nothing to REG initially, but can be used
2390 list of integer, nil, or t: designate the first
2391 element (if integer) to REG initially, the remaining
2392 elements (if integer) is designated to REG on request,
2393 if an element is t, REG can be used by any charset,
2394 nil: REG is never used. */
2395 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2396 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2397 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
2398 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding
), MAX_CHARSET
+ 1);
2399 for (i
= 0; i
< 4; i
++)
2401 if (INTEGERP (flags
[i
])
2402 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
2403 || (charset
= get_charset_id (flags
[i
])) >= 0)
2405 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2406 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
2407 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding
)[charset
] = 1;
2409 else if (EQ (flags
[i
], Qt
))
2411 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2412 default_reg_bits
|= 1 << i
;
2414 else if (CONSP (flags
[i
]))
2416 Lisp_Object tail
= flags
[i
];
2418 if (INTEGERP (XCONS (tail
)->car
)
2419 && (charset
= XINT (XCONS (tail
)->car
),
2420 CHARSET_VALID_P (charset
))
2421 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2423 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2424 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
2425 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding
)[charset
] = 1;
2428 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2429 tail
= XCONS (tail
)->cdr
;
2430 while (CONSP (tail
))
2432 if (INTEGERP (XCONS (tail
)->car
)
2433 && (charset
= XINT (XCONS (tail
)->car
),
2434 CHARSET_VALID_P (charset
))
2435 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2437 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2439 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding
)[charset
]
2442 else if (EQ (XCONS (tail
)->car
, Qt
))
2443 default_reg_bits
|= 1 << i
;
2444 tail
= XCONS (tail
)->cdr
;
2448 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2450 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
2451 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
2454 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
2456 /* REG 1 can be used only by locking shift in 7-bit env. */
2457 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2458 default_reg_bits
&= ~2;
2459 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
2460 /* Without any shifting, only REG 0 and 1 can be used. */
2461 default_reg_bits
&= 3;
2464 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2465 if (CHARSET_VALID_P (charset
)
2466 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2467 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
2469 /* We have not yet decided where to designate CHARSET. */
2470 int reg_bits
= default_reg_bits
;
2472 if (CHARSET_CHARS (charset
) == 96)
2473 /* A charset of CHARS96 can't be designated to REG 0. */
2477 /* There exist some default graphic register. */
2478 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2480 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
2482 /* We anyway have to designate CHARSET to somewhere. */
2483 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2484 = (CHARSET_CHARS (charset
) == 94
2486 : ((coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
2487 || ! coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2489 : (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
2493 coding
->require_flushing
= 1;
2497 coding
->type
= coding_type_big5
;
2499 = (NILP (XVECTOR (coding_system
)->contents
[4])
2500 ? CODING_FLAG_BIG5_HKU
2501 : CODING_FLAG_BIG5_ETEN
);
2505 coding
->type
= coding_type_ccl
;
2507 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2509 && VECTORP (XCONS (val
)->car
)
2510 && VECTORP (XCONS (val
)->cdr
))
2512 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), XCONS (val
)->car
);
2513 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), XCONS (val
)->cdr
);
2516 goto label_invalid_coding_system
;
2518 coding
->require_flushing
= 1;
2523 coding
->type
= coding_type_undecided
;
2525 coding
->type
= coding_type_no_conversion
;
2530 label_invalid_coding_system
:
2531 coding
->type
= coding_type_no_conversion
;
2532 coding
->eol_type
= CODING_EOL_LF
;
2533 coding
->symbol
= coding
->pre_write_conversion
= coding
->post_read_conversion
2538 /* Emacs has a mechanism to automatically detect a coding system if it
2539 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2540 it's impossible to distinguish some coding systems accurately
2541 because they use the same range of codes. So, at first, coding
2542 systems are categorized into 7, those are:
2544 o coding-category-emacs-mule
2546 The category for a coding system which has the same code range
2547 as Emacs' internal format. Assigned the coding-system (Lisp
2548 symbol) `emacs-mule' by default.
2550 o coding-category-sjis
2552 The category for a coding system which has the same code range
2553 as SJIS. Assigned the coding-system (Lisp
2554 symbol) `japanese-shift-jis' by default.
2556 o coding-category-iso-7
2558 The category for a coding system which has the same code range
2559 as ISO2022 of 7-bit environment. This doesn't use any locking
2560 shift and single shift functions. Assigned the coding-system
2561 (Lisp symbol) `iso-2022-7bit' by default.
2563 o coding-category-iso-8-1
2565 The category for a coding system which has the same code range
2566 as ISO2022 of 8-bit environment and graphic plane 1 used only
2567 for DIMENSION1 charset. This doesn't use any locking shift
2568 and single shift functions. Assigned the coding-system (Lisp
2569 symbol) `iso-latin-1' by default.
2571 o coding-category-iso-8-2
2573 The category for a coding system which has the same code range
2574 as ISO2022 of 8-bit environment and graphic plane 1 used only
2575 for DIMENSION2 charset. This doesn't use any locking shift
2576 and single shift functions. Assigned the coding-system (Lisp
2577 symbol) `japanese-iso-8bit' by default.
2579 o coding-category-iso-7-else
2581 The category for a coding system which has the same code range
2582 as ISO2022 of 7-bit environemnt but uses locking shift or
2583 single shift functions. Assigned the coding-system (Lisp
2584 symbol) `iso-2022-7bit-lock' by default.
2586 o coding-category-iso-8-else
2588 The category for a coding system which has the same code range
2589 as ISO2022 of 8-bit environemnt but uses locking shift or
2590 single shift functions. Assigned the coding-system (Lisp
2591 symbol) `iso-2022-8bit-ss2' by default.
2593 o coding-category-big5
2595 The category for a coding system which has the same code range
2596 as BIG5. Assigned the coding-system (Lisp symbol)
2597 `cn-big5' by default.
2599 o coding-category-binary
2601 The category for a coding system not categorized in any of the
2602 above. Assigned the coding-system (Lisp symbol)
2603 `no-conversion' by default.
2605 Each of them is a Lisp symbol and the value is an actual
2606 `coding-system's (this is also a Lisp symbol) assigned by a user.
2607 What Emacs does actually is to detect a category of coding system.
2608 Then, it uses a `coding-system' assigned to it. If Emacs can't
2609 decide only one possible category, it selects a category of the
2610 highest priority. Priorities of categories are also specified by a
2611 user in a Lisp variable `coding-category-list'.
2615 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2616 If it detects possible coding systems, return an integer in which
2617 appropriate flag bits are set. Flag bits are defined by macros
2618 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2621 detect_coding_mask (src
, src_bytes
)
2625 register unsigned char c
;
2626 unsigned char *src_end
= src
+ src_bytes
;
2629 /* At first, skip all ASCII characters and control characters except
2630 for three ISO2022 specific control characters. */
2631 label_loop_detect_coding
:
2632 while (src
< src_end
)
2636 || (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
2642 /* We found nothing other than ASCII. There's nothing to do. */
2643 return CODING_CATEGORY_MASK_ANY
;
2645 /* The text seems to be encoded in some multilingual coding system.
2646 Now, try to find in which coding system the text is encoded. */
2649 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2650 /* C is an ISO2022 specific control code of C0. */
2651 mask
= detect_coding_iso2022 (src
, src_end
);
2653 if (mask
== CODING_CATEGORY_MASK_ANY
)
2654 /* No valid ISO2022 code follows C. Try again. */
2655 goto label_loop_detect_coding
;
2659 /* If C is a special Microsoft code,
2660 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2661 or is an ISO2022 control-sequence-introducer (CSI),
2662 we should also consider the possibility of someof ISO2022 codings. */
2663 if ((VECTORP (Vmicrosoft_code_table
)
2664 && !NILP (XVECTOR (Vmicrosoft_code_table
)->contents
[c
]))
2665 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
2666 || (c
== ISO_CODE_CSI
2669 || (src
+ 1 < src_end
2671 && (*src
== '0' || *src
== '1' || *src
== '2'))))))
2672 mask
= (detect_coding_iso2022 (src
, src_end
)
2673 | detect_coding_sjis (src
, src_end
)
2674 | detect_coding_emacs_mule (src
, src_end
)
2675 | CODING_CATEGORY_MASK_BINARY
);
2678 /* C is the first byte of SJIS character code, or a
2679 leading-code of Emacs. */
2680 mask
= (detect_coding_sjis (src
, src_end
)
2681 | detect_coding_emacs_mule (src
, src_end
)
2682 | CODING_CATEGORY_MASK_BINARY
);
2685 /* C is a character of ISO2022 in graphic plane right,
2686 or a SJIS's 1-byte character code (i.e. JISX0201),
2687 or the first byte of BIG5's 2-byte code. */
2688 mask
= (detect_coding_iso2022 (src
, src_end
)
2689 | detect_coding_sjis (src
, src_end
)
2690 | detect_coding_big5 (src
, src_end
)
2691 | CODING_CATEGORY_MASK_BINARY
);
2696 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2697 The information of the detected coding system is set in CODING. */
2700 detect_coding (coding
, src
, src_bytes
)
2701 struct coding_system
*coding
;
2705 int mask
= detect_coding_mask (src
, src_bytes
);
2708 if (mask
== CODING_CATEGORY_MASK_ANY
)
2709 /* We found nothing other than ASCII. There's nothing to do. */
2713 /* The source text seems to be encoded in unknown coding system.
2714 Emacs regards the category of such a kind of coding system as
2715 `coding-category-binary'. We assume that a user has assigned
2716 an appropriate coding system for a `coding-category-binary'. */
2717 idx
= CODING_CATEGORY_IDX_BINARY
;
2720 /* We found some plausible coding systems. Let's use a coding
2721 system of the highest priority. */
2722 Lisp_Object val
= Vcoding_category_list
;
2727 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
2728 if ((idx
< CODING_CATEGORY_IDX_MAX
) && (mask
& (1 << idx
)))
2730 val
= XCONS (val
)->cdr
;
2737 /* For unknown reason, `Vcoding_category_list' contains none
2738 of found categories. Let's use any of them. */
2739 for (idx
= 0; idx
< CODING_CATEGORY_IDX_MAX
; idx
++)
2740 if (mask
& (1 << idx
))
2744 setup_coding_system (XSYMBOL (coding_category_table
[idx
])->value
, coding
);
2747 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2748 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2749 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2751 #define MAX_EOL_CHECK_COUNT 3
2754 detect_eol_type (src
, src_bytes
)
2758 unsigned char *src_end
= src
+ src_bytes
;
2760 int total
= 0; /* How many end-of-lines are found so far. */
2761 int eol_type
= CODING_EOL_UNDECIDED
;
2764 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
2767 if (c
== '\n' || c
== '\r')
2771 this_eol_type
= CODING_EOL_LF
;
2772 else if (src
>= src_end
|| *src
!= '\n')
2773 this_eol_type
= CODING_EOL_CR
;
2775 this_eol_type
= CODING_EOL_CRLF
, src
++;
2777 if (eol_type
== CODING_EOL_UNDECIDED
)
2778 /* This is the first end-of-line. */
2779 eol_type
= this_eol_type
;
2780 else if (eol_type
!= this_eol_type
)
2781 /* The found type is different from what found before.
2782 We had better not decode end-of-line. */
2783 return CODING_EOL_LF
;
2790 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2791 is encoded. If it detects an appropriate format of end-of-line, it
2792 sets the information in *CODING. */
2795 detect_eol (coding
, src
, src_bytes
)
2796 struct coding_system
*coding
;
2801 int eol_type
= detect_eol_type (src
, src_bytes
);
2803 if (eol_type
== CODING_EOL_UNDECIDED
)
2804 /* We found no end-of-line in the source text. */
2807 val
= Fget (coding
->symbol
, Qeol_type
);
2808 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
2809 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
2812 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2813 decoding, it may detect coding system and format of end-of-line if
2814 those are not yet decided. */
2817 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2818 struct coding_system
*coding
;
2819 unsigned char *source
, *destination
;
2820 int src_bytes
, dst_bytes
;
2831 if (coding
->type
== coding_type_undecided
)
2832 detect_coding (coding
, source
, src_bytes
);
2834 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
2835 detect_eol (coding
, source
, src_bytes
);
2837 coding
->carryover_size
= 0;
2838 switch (coding
->type
)
2840 case coding_type_no_conversion
:
2841 label_no_conversion
:
2842 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2843 bcopy (source
, destination
, produced
);
2844 *consumed
= produced
;
2847 case coding_type_emacs_mule
:
2848 case coding_type_undecided
:
2849 if (coding
->eol_type
== CODING_EOL_LF
2850 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2851 goto label_no_conversion
;
2852 produced
= decode_eol (coding
, source
, destination
,
2853 src_bytes
, dst_bytes
, consumed
);
2856 case coding_type_sjis
:
2857 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2858 src_bytes
, dst_bytes
, consumed
,
2862 case coding_type_iso2022
:
2863 produced
= decode_coding_iso2022 (coding
, source
, destination
,
2864 src_bytes
, dst_bytes
, consumed
);
2867 case coding_type_big5
:
2868 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2869 src_bytes
, dst_bytes
, consumed
,
2873 case coding_type_ccl
:
2874 produced
= ccl_driver (&coding
->spec
.ccl
.decoder
, source
, destination
,
2875 src_bytes
, dst_bytes
, consumed
);
2882 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2885 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2886 struct coding_system
*coding
;
2887 unsigned char *source
, *destination
;
2888 int src_bytes
, dst_bytes
;
2893 switch (coding
->type
)
2895 case coding_type_no_conversion
:
2896 label_no_conversion
:
2897 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2900 bcopy (source
, destination
, produced
);
2901 if (coding
->selective
)
2903 unsigned char *p
= destination
, *pend
= destination
+ produced
;
2905 if (*p
++ == '\015') p
[-1] = '\n';
2908 *consumed
= produced
;
2911 case coding_type_emacs_mule
:
2912 case coding_type_undecided
:
2913 if (coding
->eol_type
== CODING_EOL_LF
2914 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2915 goto label_no_conversion
;
2916 produced
= encode_eol (coding
, source
, destination
,
2917 src_bytes
, dst_bytes
, consumed
);
2920 case coding_type_sjis
:
2921 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2922 src_bytes
, dst_bytes
, consumed
,
2926 case coding_type_iso2022
:
2927 produced
= encode_coding_iso2022 (coding
, source
, destination
,
2928 src_bytes
, dst_bytes
, consumed
);
2931 case coding_type_big5
:
2932 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2933 src_bytes
, dst_bytes
, consumed
,
2937 case coding_type_ccl
:
2938 produced
= ccl_driver (&coding
->spec
.ccl
.encoder
, source
, destination
,
2939 src_bytes
, dst_bytes
, consumed
);
2946 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2948 /* Return maximum size (bytes) of a buffer enough for decoding
2949 SRC_BYTES of text encoded in CODING. */
2952 decoding_buffer_size (coding
, src_bytes
)
2953 struct coding_system
*coding
;
2958 if (coding
->type
== coding_type_iso2022
)
2960 else if (coding
->type
== coding_type_ccl
)
2961 magnification
= coding
->spec
.ccl
.decoder
.buf_magnification
;
2965 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2968 /* Return maximum size (bytes) of a buffer enough for encoding
2969 SRC_BYTES of text to CODING. */
2972 encoding_buffer_size (coding
, src_bytes
)
2973 struct coding_system
*coding
;
2978 if (coding
->type
== coding_type_ccl
)
2979 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
2983 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2986 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2987 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2990 char *conversion_buffer
;
2991 int conversion_buffer_size
;
2993 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2994 or decoding. Sufficient memory is allocated automatically. If we
2995 run out of memory, return NULL. */
2998 get_conversion_buffer (size
)
3001 if (size
> conversion_buffer_size
)
3004 int real_size
= conversion_buffer_size
* 2;
3006 while (real_size
< size
) real_size
*= 2;
3007 buf
= (char *) xmalloc (real_size
);
3008 xfree (conversion_buffer
);
3009 conversion_buffer
= buf
;
3010 conversion_buffer_size
= real_size
;
3012 return conversion_buffer
;
3017 /*** 7. Emacs Lisp library functions ***/
3019 DEFUN ("coding-system-spec", Fcoding_system_spec
, Scoding_system_spec
,
3021 "Return coding-spec of CODING-SYSTEM.\n\
3022 If CODING-SYSTEM is not a valid coding-system, return nil.")
3026 while (SYMBOLP (obj
) && !NILP (obj
))
3027 obj
= Fget (obj
, Qcoding_system
);
3028 return ((NILP (obj
) || !VECTORP (obj
) || XVECTOR (obj
)->size
!= 5)
3032 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
3033 "Return t if OBJECT is nil or a coding-system.\n\
3034 See document of make-coding-system for coding-system object.")
3038 return ((NILP (obj
) || !NILP (Fcoding_system_spec (obj
))) ? Qt
: Qnil
);
3041 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
3042 Sread_non_nil_coding_system
, 1, 1, 0,
3043 "Read a coding system from the minibuffer, prompting with string PROMPT.")
3050 val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_spec
,
3051 Qt
, Qnil
, Qnil
, Qnil
);
3053 while (XSTRING (val
)->size
== 0);
3054 return (Fintern (val
, Qnil
));
3057 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 1, 0,
3058 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3062 Lisp_Object val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_p
,
3063 Qt
, Qnil
, Qnil
, Qnil
);
3064 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
3067 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
3069 "Check validity of CODING-SYSTEM.\n\
3070 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3071 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3072 The value of property should be a vector of length 5.")
3074 Lisp_Object coding_system
;
3076 CHECK_SYMBOL (coding_system
, 0);
3077 if (!NILP (Fcoding_system_p (coding_system
)))
3078 return coding_system
;
3080 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
3083 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
3085 "Detect coding system of the text in the region between START and END.\n\
3086 Return a list of possible coding systems ordered by priority.\n\
3087 If only ASCII characters are found, it returns `undecided'\n\
3088 or its subsidiary coding system according to a detected end-of-line format.")
3092 int coding_mask
, eol_type
;
3096 validate_region (&b
, &e
);
3097 beg
= XINT (b
), end
= XINT (e
);
3098 if (beg
< GPT
&& end
>= GPT
) move_gap (end
);
3100 coding_mask
= detect_coding_mask (POS_ADDR (beg
), end
- beg
);
3101 eol_type
= detect_eol_type (POS_ADDR (beg
), end
- beg
);
3103 if (coding_mask
== CODING_CATEGORY_MASK_ANY
)
3105 val
= intern ("undecided");
3106 if (eol_type
!= CODING_EOL_UNDECIDED
)
3108 Lisp_Object val2
= Fget (val
, Qeol_type
);
3110 val
= XVECTOR (val2
)->contents
[eol_type
];
3117 /* At first, gather possible coding-systems in VAL in a reverse
3120 for (val2
= Vcoding_category_list
;
3122 val2
= XCONS (val2
)->cdr
)
3125 = XFASTINT (Fget (XCONS (val2
)->car
, Qcoding_category_index
));
3126 if (coding_mask
& (1 << idx
))
3127 val
= Fcons (Fsymbol_value (XCONS (val2
)->car
), val
);
3130 /* Then, change the order of the list, while getting subsidiary
3134 for (; !NILP (val2
); val2
= XCONS (val2
)->cdr
)
3136 if (eol_type
== CODING_EOL_UNDECIDED
)
3137 val
= Fcons (XCONS (val2
)->car
, val
);
3140 Lisp_Object val3
= Fget (XCONS (val2
)->car
, Qeol_type
);
3142 val
= Fcons (XVECTOR (val3
)->contents
[eol_type
], val
);
3144 val
= Fcons (XCONS (val2
)->car
, val
);
3152 /* Scan text in the region between *BEGP and *ENDP, skip characters
3153 which we never have to encode to (iff ENCODEP is 1) or decode from
3154 coding system CODING at the head and tail, then set BEGP and ENDP
3155 to the addresses of start and end of the text we actually convert. */
3158 shrink_conversion_area (begp
, endp
, coding
, encodep
)
3159 unsigned char **begp
, **endp
;
3160 struct coding_system
*coding
;
3163 register unsigned char *beg_addr
= *begp
, *end_addr
= *endp
;
3165 if (coding
->eol_type
!= CODING_EOL_LF
3166 && coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3167 /* Since we anyway have to convert end-of-line format, it is not
3168 worth skipping at most 100 bytes or so. */
3171 if (encodep
) /* for encoding */
3173 switch (coding
->type
)
3175 case coding_type_no_conversion
:
3176 case coding_type_emacs_mule
:
3177 case coding_type_undecided
:
3178 /* We need no conversion. */
3181 case coding_type_ccl
:
3182 /* We can't skip any data. */
3184 case coding_type_iso2022
:
3185 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3187 unsigned char *bol
= beg_addr
;
3188 while (beg_addr
< end_addr
&& *beg_addr
< 0x80)
3191 if (*(beg_addr
- 1) == '\n')
3195 goto label_skip_tail
;
3199 /* We can skip all ASCII characters at the head and tail. */
3200 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
3202 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
3206 else /* for decoding */
3208 switch (coding
->type
)
3210 case coding_type_no_conversion
:
3211 /* We need no conversion. */
3214 case coding_type_emacs_mule
:
3215 if (coding
->eol_type
== CODING_EOL_LF
)
3217 /* We need no conversion. */
3221 /* We can skip all but carriage-return. */
3222 while (beg_addr
< end_addr
&& *beg_addr
!= '\r') beg_addr
++;
3223 while (beg_addr
< end_addr
&& *(end_addr
- 1) != '\r') end_addr
--;
3225 case coding_type_sjis
:
3226 case coding_type_big5
:
3227 /* We can skip all ASCII characters at the head. */
3228 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
3229 /* We can skip all ASCII characters at the tail except for
3230 the second byte of SJIS or BIG5 code. */
3231 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
3232 if (end_addr
!= *endp
)
3235 case coding_type_ccl
:
3236 /* We can't skip any data. */
3238 default: /* i.e. case coding_type_iso2022: */
3242 /* We can skip all ASCII characters except for a few
3243 control codes at the head. */
3244 while (beg_addr
< end_addr
&& (c
= *beg_addr
) < 0x80
3245 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3246 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
)
3257 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3258 text between B and E. B and E are buffer position. */
3261 code_convert_region (b
, e
, coding
, encodep
)
3263 struct coding_system
*coding
;
3266 int beg
, end
, len
, consumed
, produced
;
3268 unsigned char *begp
, *endp
;
3271 validate_region (&b
, &e
);
3272 beg
= XINT (b
), end
= XINT (e
);
3273 if (beg
< GPT
&& end
>= GPT
)
3276 if (encodep
&& !NILP (coding
->pre_write_conversion
))
3278 /* We must call a pre-conversion function which may put a new
3279 text to be converted in a new buffer. */
3280 struct buffer
*old
= current_buffer
, *new;
3283 call2 (coding
->pre_write_conversion
, b
, e
);
3284 if (old
!= current_buffer
)
3286 /* Replace the original text by the text just generated. */
3288 new = current_buffer
;
3289 set_buffer_internal (old
);
3290 del_range (beg
, end
);
3291 insert_from_buffer (new, 1, len
, 0);
3296 /* We may be able to shrink the conversion region. */
3297 begp
= POS_ADDR (beg
); endp
= begp
+ (end
- beg
);
3298 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3301 /* We need no conversion. */
3305 beg
+= begp
- POS_ADDR (beg
);
3306 end
= beg
+ (endp
- begp
);
3309 len
= encoding_buffer_size (coding
, end
- beg
);
3311 len
= decoding_buffer_size (coding
, end
- beg
);
3312 buf
= get_conversion_buffer (len
);
3314 coding
->last_block
= 1;
3316 ? encode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3318 : decode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3321 len
= produced
+ (beg
- XINT (b
)) + (XINT (e
) - end
);
3324 insert (buf
, produced
);
3325 del_range (PT
, PT
+ end
- beg
);
3327 pos
= PT
+ (pos
- end
);
3333 if (!encodep
&& !NILP (coding
->post_read_conversion
))
3335 /* We must call a post-conversion function which may alter
3336 the text just converted. */
3341 insval
= call1 (coding
->post_read_conversion
, make_number (len
));
3342 CHECK_NUMBER (insval
, 0);
3343 len
= XINT (insval
);
3346 return make_number (len
);
3350 code_convert_string (str
, coding
, encodep
, nocopy
)
3351 Lisp_Object str
, nocopy
;
3352 struct coding_system
*coding
;
3355 int len
, consumed
, produced
;
3357 unsigned char *begp
, *endp
;
3358 int head_skip
, tail_skip
;
3359 struct gcpro gcpro1
;
3361 if (encodep
&& !NILP (coding
->pre_write_conversion
)
3362 || !encodep
&& !NILP (coding
->post_read_conversion
))
3364 /* Since we have to call Lisp functions which assume target text
3365 is in a buffer, after setting a temporary buffer, call
3366 code_convert_region. */
3367 int count
= specpdl_ptr
- specpdl
;
3368 int len
= XSTRING (str
)->size
;
3370 struct buffer
*old
= current_buffer
;
3372 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
3373 temp_output_buffer_setup (" *code-converting-work*");
3374 set_buffer_internal (XBUFFER (Vstandard_output
));
3375 insert_from_string (str
, 0, len
, 0);
3376 code_convert_region (make_number (BEGV
), make_number (ZV
),
3378 result
= make_buffer_string (BEGV
, ZV
, 0);
3379 set_buffer_internal (old
);
3380 return unbind_to (count
, result
);
3383 /* We may be able to shrink the conversion region. */
3384 begp
= XSTRING (str
)->data
;
3385 endp
= begp
+ XSTRING (str
)->size
;
3386 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3389 /* We need no conversion. */
3390 return (NILP (nocopy
) ? Fcopy_sequence (str
) : str
);
3392 head_skip
= begp
- XSTRING (str
)->data
;
3393 tail_skip
= XSTRING (str
)->size
- head_skip
- (endp
- begp
);
3398 len
= encoding_buffer_size (coding
, endp
- begp
);
3400 len
= decoding_buffer_size (coding
, endp
- begp
);
3401 buf
= get_conversion_buffer (len
+ head_skip
+ tail_skip
);
3403 bcopy (XSTRING (str
)->data
, buf
, head_skip
);
3404 coding
->last_block
= 1;
3406 ? encode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3407 buf
+ head_skip
, endp
- begp
, len
, &consumed
)
3408 : decode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3409 buf
+ head_skip
, endp
- begp
, len
, &consumed
));
3410 bcopy (XSTRING (str
)->data
+ head_skip
+ (endp
- begp
),
3411 buf
+ head_skip
+ produced
,
3416 return make_string (buf
, head_skip
+ produced
+ tail_skip
);
3419 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
3420 3, 3, "r\nzCoding system: ",
3421 "Decode current region by specified coding system.\n\
3422 When called from a program, takes three arguments:\n\
3423 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3424 Return length of decoded text.")
3425 (b
, e
, coding_system
)
3426 Lisp_Object b
, e
, coding_system
;
3428 struct coding_system coding
;
3430 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3431 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3432 CHECK_SYMBOL (coding_system
, 2);
3434 if (NILP (coding_system
))
3435 return make_number (XFASTINT (e
) - XFASTINT (b
));
3436 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3437 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3439 return code_convert_region (b
, e
, &coding
, 0);
3442 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
3443 3, 3, "r\nzCoding system: ",
3444 "Encode current region by specified coding system.\n\
3445 When called from a program, takes three arguments:\n\
3446 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3447 Return length of encoded text.")
3448 (b
, e
, coding_system
)
3449 Lisp_Object b
, e
, coding_system
;
3451 struct coding_system coding
;
3453 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3454 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3455 CHECK_SYMBOL (coding_system
, 2);
3457 if (NILP (coding_system
))
3458 return make_number (XFASTINT (e
) - XFASTINT (b
));
3459 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3460 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3462 return code_convert_region (b
, e
, &coding
, 1);
3465 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
3467 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3468 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3470 (string
, coding_system
, nocopy
)
3471 Lisp_Object string
, coding_system
, nocopy
;
3473 struct coding_system coding
;
3475 CHECK_STRING (string
, 0);
3476 CHECK_SYMBOL (coding_system
, 1);
3478 if (NILP (coding_system
))
3479 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3480 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3481 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3483 return code_convert_string (string
, &coding
, 0, nocopy
);
3486 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
3488 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3489 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3491 (string
, coding_system
, nocopy
)
3492 Lisp_Object string
, coding_system
, nocopy
;
3494 struct coding_system coding
;
3496 CHECK_STRING (string
, 0);
3497 CHECK_SYMBOL (coding_system
, 1);
3499 if (NILP (coding_system
))
3500 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3501 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3502 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3504 return code_convert_string (string
, &coding
, 1, nocopy
);
3507 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
3508 "Decode a JISX0208 character of shift-jis encoding.\n\
3509 CODE is the character code in SJIS.\n\
3510 Return the corresponding character.")
3514 unsigned char c1
, c2
, s1
, s2
;
3517 CHECK_NUMBER (code
, 0);
3518 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
3519 DECODE_SJIS (s1
, s2
, c1
, c2
);
3520 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
3524 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
3525 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3526 Return the corresponding character code in SJIS.")
3530 int charset
, c1
, c2
, s1
, s2
;
3533 CHECK_NUMBER (ch
, 0);
3534 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3535 if (charset
== charset_jisx0208
)
3537 ENCODE_SJIS (c1
, c2
, s1
, s2
);
3538 XSETFASTINT (val
, (s1
<< 8) | s2
);
3541 XSETFASTINT (val
, 0);
3545 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
3546 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3547 CODE is the character code in BIG5.\n\
3548 Return the corresponding character.")
3553 unsigned char b1
, b2
, c1
, c2
;
3556 CHECK_NUMBER (code
, 0);
3557 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
3558 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
3559 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
3563 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
3564 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3565 Return the corresponding character code in Big5.")
3569 int charset
, c1
, c2
, b1
, b2
;
3572 CHECK_NUMBER (ch
, 0);
3573 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3574 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
3576 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
3577 XSETFASTINT (val
, (b1
<< 8) | b2
);
3580 XSETFASTINT (val
, 0);
3584 DEFUN ("set-terminal-coding-system-internal",
3585 Fset_terminal_coding_system_internal
,
3586 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
3588 Lisp_Object coding_system
;
3590 CHECK_SYMBOL (coding_system
, 0);
3591 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
3592 /* We had better not send unexpected characters to terminal. */
3593 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
3598 DEFUN ("set-safe-terminal-coding-system-internal",
3599 Fset_safe_terminal_coding_system_internal
,
3600 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
3602 Lisp_Object coding_system
;
3604 CHECK_SYMBOL (coding_system
, 0);
3605 setup_coding_system (Fcheck_coding_system (coding_system
),
3606 &safe_terminal_coding
);
3610 DEFUN ("terminal-coding-system",
3611 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
3612 "Return coding-system of your terminal.")
3615 return terminal_coding
.symbol
;
3618 DEFUN ("set-keyboard-coding-system-internal",
3619 Fset_keyboard_coding_system_internal
,
3620 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
3622 Lisp_Object coding_system
;
3624 CHECK_SYMBOL (coding_system
, 0);
3625 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
3629 DEFUN ("keyboard-coding-system",
3630 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
3631 "Return coding-system of what is sent from terminal keyboard.")
3634 return keyboard_coding
.symbol
;
3638 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
3639 Sfind_operation_coding_system
, 1, MANY
, 0,
3640 "Choose a coding system for an operation based on the target name.\n\
3641 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3642 DECODING-SYSTEM is the coding system to use for decoding\n\
3643 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3644 for encoding (in case OPERATION does encoding).\n\
3646 The first argument OPERATION specifies an I/O primitive:\n\
3647 For file I/O, `insert-file-contents' or `write-region'.\n\
3648 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3649 For network I/O, `open-network-stream'.\n\
3651 The remaining arguments should be the same arguments that were passed\n\
3652 to the primitive. Depending on which primitive, one of those arguments\n\
3653 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3654 whichever argument specifies the file name is TARGET.\n\
3656 TARGET has a meaning which depends on OPERATION:\n\
3657 For file I/O, TARGET is a file name.\n\
3658 For process I/O, TARGET is a process name.\n\
3659 For network I/O, TARGET is a service name or a port number\n\
3661 This function looks up what specified for TARGET in,\n\
3662 `file-coding-system-alist', `process-coding-system-alist',\n\
3663 or `network-coding-system-alist' depending on OPERATION.\n\
3664 They may specify a coding system, a cons of coding systems,\n\
3665 or a function symbol to call.\n\
3666 In the last case, we call the function with one argument,\n\
3667 which is a list of all the arguments given to this function.")
3672 Lisp_Object operation
, target_idx
, target
, val
;
3673 register Lisp_Object chain
;
3676 error ("Too few arguments");
3677 operation
= args
[0];
3678 if (!SYMBOLP (operation
)
3679 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
3680 error ("Invalid first arguement");
3681 if (nargs
< 1 + XINT (target_idx
))
3682 error ("Too few arguments for operation: %s",
3683 XSYMBOL (operation
)->name
->data
);
3684 target
= args
[XINT (target_idx
) + 1];
3685 if (!(STRINGP (target
)
3686 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
3687 error ("Invalid %dth argument", XINT (target_idx
) + 1);
3689 chain
= ((EQ (operation
, Qinsert_file_contents
)
3690 || EQ (operation
, Qwrite_region
))
3691 ? Vfile_coding_system_alist
3692 : (EQ (operation
, Qopen_network_stream
)
3693 ? Vnetwork_coding_system_alist
3694 : Vprocess_coding_system_alist
));
3698 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
3700 Lisp_Object elt
= XCONS (chain
)->car
;
3703 && ((STRINGP (target
)
3704 && STRINGP (XCONS (elt
)->car
)
3705 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
3706 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
3708 val
= XCONS (elt
)->cdr
;
3711 if (! SYMBOLP (val
))
3713 if (! NILP (Fcoding_system_p (val
)))
3714 return Fcons (val
, val
);
3715 if (!NILP (Ffboundp (val
)))
3716 return call1 (val
, Flist (nargs
, args
));
3726 /*** 8. Post-amble ***/
3732 /* Emacs' internal format specific initialize routine. */
3733 for (i
= 0; i
<= 0x20; i
++)
3734 emacs_code_class
[i
] = EMACS_control_code
;
3735 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
3736 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
3737 for (i
= 0x21 ; i
< 0x7F; i
++)
3738 emacs_code_class
[i
] = EMACS_ascii_code
;
3739 emacs_code_class
[0x7F] = EMACS_control_code
;
3740 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
3741 for (i
= 0x81; i
< 0xFF; i
++)
3742 emacs_code_class
[i
] = EMACS_invalid_code
;
3743 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
3744 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
3745 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
3746 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
3748 /* ISO2022 specific initialize routine. */
3749 for (i
= 0; i
< 0x20; i
++)
3750 iso_code_class
[i
] = ISO_control_code
;
3751 for (i
= 0x21; i
< 0x7F; i
++)
3752 iso_code_class
[i
] = ISO_graphic_plane_0
;
3753 for (i
= 0x80; i
< 0xA0; i
++)
3754 iso_code_class
[i
] = ISO_control_code
;
3755 for (i
= 0xA1; i
< 0xFF; i
++)
3756 iso_code_class
[i
] = ISO_graphic_plane_1
;
3757 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
3758 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
3759 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
3760 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
3761 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
3762 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
3763 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
3764 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
3765 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
3766 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
3768 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
3769 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
3771 setup_coding_system (Qnil
, &keyboard_coding
);
3772 setup_coding_system (Qnil
, &terminal_coding
);
3773 setup_coding_system (Qnil
, &safe_terminal_coding
);
3775 #if defined (MSDOS) || defined (WINDOWSNT)
3776 system_eol_type
= CODING_EOL_CRLF
;
3778 system_eol_type
= CODING_EOL_LF
;
3786 Qtarget_idx
= intern ("target-idx");
3787 staticpro (&Qtarget_idx
);
3789 /* Target FILENAME is the first argument. */
3790 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
3791 /* Target FILENAME is the third argument. */
3792 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
3794 Qcall_process
= intern ("call-process");
3795 staticpro (&Qcall_process
);
3796 /* Target PROGRAM is the first argument. */
3797 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
3799 Qcall_process_region
= intern ("call-process-region");
3800 staticpro (&Qcall_process_region
);
3801 /* Target PROGRAM is the third argument. */
3802 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
3804 Qstart_process
= intern ("start-process");
3805 staticpro (&Qstart_process
);
3806 /* Target PROGRAM is the third argument. */
3807 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
3809 Qopen_network_stream
= intern ("open-network-stream");
3810 staticpro (&Qopen_network_stream
);
3811 /* Target SERVICE is the fourth argument. */
3812 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
3814 Qcoding_system
= intern ("coding-system");
3815 staticpro (&Qcoding_system
);
3817 Qeol_type
= intern ("eol-type");
3818 staticpro (&Qeol_type
);
3820 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
3821 staticpro (&Qbuffer_file_coding_system
);
3823 Qpost_read_conversion
= intern ("post-read-conversion");
3824 staticpro (&Qpost_read_conversion
);
3826 Qpre_write_conversion
= intern ("pre-write-conversion");
3827 staticpro (&Qpre_write_conversion
);
3829 Qcoding_system_spec
= intern ("coding-system-spec");
3830 staticpro (&Qcoding_system_spec
);
3832 Qcoding_system_p
= intern ("coding-system-p");
3833 staticpro (&Qcoding_system_p
);
3835 Qcoding_system_error
= intern ("coding-system-error");
3836 staticpro (&Qcoding_system_error
);
3838 Fput (Qcoding_system_error
, Qerror_conditions
,
3839 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
3840 Fput (Qcoding_system_error
, Qerror_message
,
3841 build_string ("Invalid coding system"));
3843 Qcoding_category_index
= intern ("coding-category-index");
3844 staticpro (&Qcoding_category_index
);
3848 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3850 coding_category_table
[i
] = intern (coding_category_name
[i
]);
3851 staticpro (&coding_category_table
[i
]);
3852 Fput (coding_category_table
[i
], Qcoding_category_index
,
3857 Qcharacter_unification_table
= intern ("character-unification-table");
3858 staticpro (&Qcharacter_unification_table
);
3859 Fput (Qcharacter_unification_table
, Qchar_table_extra_slots
,
3862 Qcharacter_unification_table_for_decode
3863 = intern ("character-unification-table-for-decode");
3864 staticpro (&Qcharacter_unification_table_for_decode
);
3866 Qcharacter_unification_table_for_encode
3867 = intern ("character-unification-table-for-encode");
3868 staticpro (&Qcharacter_unification_table_for_encode
);
3870 Qemacs_mule
= intern ("emacs-mule");
3871 staticpro (&Qemacs_mule
);
3873 defsubr (&Scoding_system_spec
);
3874 defsubr (&Scoding_system_p
);
3875 defsubr (&Sread_coding_system
);
3876 defsubr (&Sread_non_nil_coding_system
);
3877 defsubr (&Scheck_coding_system
);
3878 defsubr (&Sdetect_coding_region
);
3879 defsubr (&Sdecode_coding_region
);
3880 defsubr (&Sencode_coding_region
);
3881 defsubr (&Sdecode_coding_string
);
3882 defsubr (&Sencode_coding_string
);
3883 defsubr (&Sdecode_sjis_char
);
3884 defsubr (&Sencode_sjis_char
);
3885 defsubr (&Sdecode_big5_char
);
3886 defsubr (&Sencode_big5_char
);
3887 defsubr (&Sset_terminal_coding_system_internal
);
3888 defsubr (&Sset_safe_terminal_coding_system_internal
);
3889 defsubr (&Sterminal_coding_system
);
3890 defsubr (&Sset_keyboard_coding_system_internal
);
3891 defsubr (&Skeyboard_coding_system
);
3892 defsubr (&Sfind_operation_coding_system
);
3894 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
3895 "List of coding-categories (symbols) ordered by priority.");
3899 Vcoding_category_list
= Qnil
;
3900 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
3901 Vcoding_category_list
3902 = Fcons (coding_category_table
[i
], Vcoding_category_list
);
3905 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
3906 "Specify the coding system for read operations.\n\
3907 It is useful to bind this variable with `let', but do not set it globally.\n\
3908 If the value is a coding system, it is used for decoding on read operation.\n\
3909 If not, an appropriate element is used from one of the coding system alists:\n\
3910 There are three such tables, `file-coding-system-alist',\n\
3911 `process-coding-system-alist', and `network-coding-system-alist'.");
3912 Vcoding_system_for_read
= Qnil
;
3914 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
3915 "Specify the coding system for write operations.\n\
3916 It is useful to bind this variable with `let', but do not set it globally.\n\
3917 If the value is a coding system, it is used for encoding on write operation.\n\
3918 If not, an appropriate element is used from one of the coding system alists:\n\
3919 There are three such tables, `file-coding-system-alist',\n\
3920 `process-coding-system-alist', and `network-coding-system-alist'.");
3921 Vcoding_system_for_write
= Qnil
;
3923 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
3924 "Coding system used in the latest file or process I/O.");
3925 Vlast_coding_system_used
= Qnil
;
3927 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
3928 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3929 inhibit_eol_conversion
= 0;
3931 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
3932 "Alist to decide a coding system to use for a file I/O operation.\n\
3933 The format is ((PATTERN . VAL) ...),\n\
3934 where PATTERN is a regular expression matching a file name,\n\
3935 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3936 If VAL is a coding system, it is used for both decoding and encoding\n\
3937 the file contents.\n\
3938 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3939 and the cdr part is used for encoding.\n\
3940 If VAL is a function symbol, the function must return a coding system\n\
3941 or a cons of coding systems which are used as above.\n\
3943 See also the function `find-operation-coding-system'.");
3944 Vfile_coding_system_alist
= Qnil
;
3946 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
3947 "Alist to decide a coding system to use for a process I/O operation.\n\
3948 The format is ((PATTERN . VAL) ...),\n\
3949 where PATTERN is a regular expression matching a program name,\n\
3950 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3951 If VAL is a coding system, it is used for both decoding what received\n\
3952 from the program and encoding what sent to the program.\n\
3953 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3954 and the cdr part is used for encoding.\n\
3955 If VAL is a function symbol, the function must return a coding system\n\
3956 or a cons of coding systems which are used as above.\n\
3958 See also the function `find-operation-coding-system'.");
3959 Vprocess_coding_system_alist
= Qnil
;
3961 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
3962 "Alist to decide a coding system to use for a network I/O operation.\n\
3963 The format is ((PATTERN . VAL) ...),\n\
3964 where PATTERN is a regular expression matching a network service name\n\
3965 or is a port number to connect to,\n\
3966 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3967 If VAL is a coding system, it is used for both decoding what received\n\
3968 from the network stream and encoding what sent to the network stream.\n\
3969 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3970 and the cdr part is used for encoding.\n\
3971 If VAL is a function symbol, the function must return a coding system\n\
3972 or a cons of coding systems which are used as above.\n\
3974 See also the function `find-operation-coding-system'.");
3975 Vnetwork_coding_system_alist
= Qnil
;
3977 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
3978 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3979 eol_mnemonic_unix
= ':';
3981 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
3982 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3983 eol_mnemonic_dos
= '\\';
3985 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
3986 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3987 eol_mnemonic_mac
= '/';
3989 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
3990 "Mnemonic character indicating end-of-line format is not yet decided.");
3991 eol_mnemonic_undecided
= ':';
3993 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification
,
3994 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3995 Venable_character_unification
= Qt
;
3997 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3998 &Vstandard_character_unification_table_for_decode
,
3999 "Table for unifying characters when reading.");
4000 Vstandard_character_unification_table_for_decode
= Qnil
;
4002 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4003 &Vstandard_character_unification_table_for_encode
,
4004 "Table for unifying characters when writing.");
4005 Vstandard_character_unification_table_for_encode
= Qnil
;
4007 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
4008 "Alist of charsets vs revision numbers.\n\
4009 While encoding, if a charset (car part of an element) is found,\n\
4010 designate it with the escape sequence identifing revision (cdr part of the element).");
4011 Vcharset_revision_alist
= Qnil
;
4013 DEFVAR_LISP ("default-process-coding-system",
4014 &Vdefault_process_coding_system
,
4015 "Cons of coding systems used for process I/O by default.\n\
4016 The car part is used for decoding a process output,\n\
4017 the cdr part is used for encoding a text to be sent to a process.");
4018 Vdefault_process_coding_system
= Qnil
;
4020 DEFVAR_LISP ("special-microsoft-code-table", &Vmicrosoft_code_table
,
4021 "Table of special Microsoft codes in the range 128..159 (inclusive).\n\
4022 This is a vector of length 256.\n\
4023 If Nth element is non-nil, the existence of code N in a file\n\
4024 (or output of subprocess) doesn't prevent it to be detected as\n\
4025 a coding system of ISO 2022 variant (e.g. iso-latin-1) on reading a file\n\
4026 or reading output of a subprocess.\n\
4027 Only 128th through 159th elements has a meaning.");
4028 Vmicrosoft_code_table
= Fmake_vector (make_number (256), Qnil
);