]> code.delx.au - gnu-emacs/blob - src/coding.c
(concat): Handle bool-vectors correctly.
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
69
70 4. Raw text
71
72 A coding system for a text containing random 8-bit code. Emacs does
73 no code conversion on such a text except for end-of-line format.
74
75 5. Other
76
77 If a user wants to read/write a text encoded in a coding system not
78 listed above, he can supply a decoder and an encoder for it in CCL
79 (Code Conversion Language) programs. Emacs executes the CCL program
80 while reading/writing.
81
82 Emacs represents a coding system by a Lisp symbol that has a property
83 `coding-system'. But, before actually using the coding system, the
84 information about it is set in a structure of type `struct
85 coding_system' for rapid processing. See section 6 for more details.
86
87 */
88
89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
90
91 How end-of-line of a text is encoded depends on a system. For
92 instance, Unix's format is just one byte of `line-feed' code,
93 whereas DOS's format is two-byte sequence of `carriage-return' and
94 `line-feed' codes. MacOS's format is usually one byte of
95 `carriage-return'.
96
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
101
102 */
103
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111 #if 0
112 int
113 detect_coding_emacs_mule (src, src_end)
114 unsigned char *src, *src_end;
115 {
116 ...
117 }
118 #endif
119
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which
125 should not exceed DST_BYTES. These functions set the information of
126 original and decoded texts in the members produced, produced_char,
127 consumed, and consumed_char of the structure *CODING.
128
129 The return value is an integer (CODING_FINISH_XXX) indicating how
130 the decoding finished.
131
132 DST_BYTES zero means that source area and destination area are
133 overlapped, which means that we can produce a decoded text until it
134 reaches at the head of not-yet-decoded source text.
135
136 Below is a template of these functions. */
137 #if 0
138 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
139 struct coding_system *coding;
140 unsigned char *source, *destination;
141 int src_bytes, dst_bytes;
142 {
143 ...
144 }
145 #endif
146
147 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
148
149 These functions encode SRC_BYTES length text at SOURCE of Emacs'
150 internal format (emacs-mule) to CODING. The resulting text goes to
151 a place pointed to by DESTINATION, the length of which should not
152 exceed DST_BYTES. These functions set the information of
153 original and encoded texts in the members produced, produced_char,
154 consumed, and consumed_char of the structure *CODING.
155
156 The return value is an integer (CODING_FINISH_XXX) indicating how
157 the encoding finished.
158
159 DST_BYTES zero means that source area and destination area are
160 overlapped, which means that we can produce an encoded text until it
161 reaches the head of not-yet-encoded source text.
162
163 Below is a template of these functions. */
164 #if 0
165 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
166 struct coding_system *coding;
167 unsigned char *source, *destination;
168 int src_bytes, dst_bytes;
169 {
170 ...
171 }
172 #endif
173
174 /*** COMMONLY USED MACROS ***/
175
176 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
177 THREE_MORE_BYTES safely get one, two, and three bytes from the
178 source text respectively. If there are not enough bytes in the
179 source, they jump to `label_end_of_loop'. The caller should set
180 variables `src' and `src_end' to appropriate areas in advance. */
181
/* Fetch the next source byte into C1 and advance `src' past it.  If
   the source is exhausted, jump to `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
  } while (0)
189
/* Fetch the next two source bytes into C1 and C2 and advance `src'
   past them.  If fewer than two bytes remain, consume nothing and
   jump to `label_end_of_loop'.  The remaining length is computed as
   `src_end - src' instead of comparing `src + 1' with `src_end', so
   that no pointer beyond one-past-the-end of the source is formed.  */
#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src_end - src < 2)              \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
    c2 = *src++;                        \
  } while (0)
197
/* Fetch the next three source bytes into C1, C2 and C3 and advance
   `src' past them.  If fewer than three bytes remain, consume nothing
   and jump to `label_end_of_loop'.  The remaining length is computed
   as `src_end - src' instead of comparing `src + 2' with `src_end',
   so that no pointer beyond one-past-the-end of the source is
   formed.  */
#define THREE_MORE_BYTES(c1, c2, c3)    \
  do {                                  \
    if (src_end - src < 3)              \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
    c2 = *src++;                        \
    c3 = *src++;                        \
  } while (0)
205
206 /* The following three macros DECODE_CHARACTER_ASCII,
207 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
208 the multi-byte form of a character of each class at the place
209 pointed by `dst'. The caller should set the variable `dst' to
210 point to an appropriate area and the variable `coding' to point to
211 the coding-system of the currently decoding text in advance. */
212
/* Store the ASCII character C at `dst'.  Outside a composition, C is
   written as is and `coding->produced_char' is incremented.  While
   composing, C is written as the two bytes 0xA0 and C | 0x80, and the
   character count is left alone.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (! COMPOSING_P (coding->composing))      \
      {                                         \
        *dst++ = (c);                           \
        coding->produced_char++;                \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
225
/* Store at `dst' the multi-byte form of the DIMENSION1 character of
   charset CHARSET whose position-code is C.  The base leading-code is
   written first (with 0x20 added while composing; otherwise
   `coding->produced_char' is incremented), then the extended
   leading-code if CHARSET has a nonzero one, then C | 0x80.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (! COMPOSING_P (coding->composing))                              \
      {                                                                 \
        *dst++ = leading_code;                                          \
        coding->produced_char++;                                        \
      }                                                                 \
    else                                                                \
      *dst++ = leading_code + 0x20;                                     \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
243
/* Store at `dst' the multi-byte form of the DIMENSION2 character of
   charset CHARSET whose position-codes are C1 and C2: everything
   DECODE_CHARACTER_DIMENSION1 emits for C1, followed by C2 | 0x80.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst = (c2) | 0x80;                                 \
    dst++;                                              \
  } while (0)
252
253 \f
254 /*** 1. Preamble ***/
255
256 #include <stdio.h>
257
258 #ifdef emacs
259
260 #include <config.h>
261 #include "lisp.h"
262 #include "buffer.h"
263 #include "charset.h"
264 #include "ccl.h"
265 #include "coding.h"
266 #include "window.h"
267
268 #else /* not emacs */
269
270 #include "mulelib.h"
271
272 #endif /* not emacs */
273
274 Lisp_Object Qcoding_system, Qeol_type;
275 Lisp_Object Qbuffer_file_coding_system;
276 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
277 Lisp_Object Qno_conversion, Qundecided;
278 Lisp_Object Qcoding_system_history;
279 Lisp_Object Qsafe_charsets;
280
281 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
282 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
283 Lisp_Object Qstart_process, Qopen_network_stream;
284 Lisp_Object Qtarget_idx;
285
286 Lisp_Object Vselect_safe_coding_system_function;
287
288 /* Mnemonic character of each format of end-of-line. */
289 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
290 /* Mnemonic character to indicate format of end-of-line is not yet
291 decided. */
292 int eol_mnemonic_undecided;
293
294 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
295 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
296 int system_eol_type;
297
298 #ifdef emacs
299
300 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
301
302 Lisp_Object Qcoding_system_p, Qcoding_system_error;
303
304 /* Coding system emacs-mule and raw-text are for converting only
305 end-of-line format. */
306 Lisp_Object Qemacs_mule, Qraw_text;
307
308 /* Coding-systems are handed between Emacs Lisp programs and C internal
309 routines by the following three variables. */
310 /* Coding-system for reading files and receiving data from process. */
311 Lisp_Object Vcoding_system_for_read;
312 /* Coding-system for writing files and sending data to process. */
313 Lisp_Object Vcoding_system_for_write;
314 /* Coding-system actually used in the latest I/O. */
315 Lisp_Object Vlast_coding_system_used;
316
317 /* A vector of length 256 which contains information about special
318 Latin codes (especially for dealing with Microsoft code). */
319 Lisp_Object Vlatin_extra_code_table;
320
321 /* Flag to inhibit code conversion of end-of-line format. */
322 int inhibit_eol_conversion;
323
324 /* Coding system to be used to encode text for terminal display. */
325 struct coding_system terminal_coding;
326
327 /* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329 struct coding_system safe_terminal_coding;
330
331 /* Coding system of what is sent from terminal keyboard. */
332 struct coding_system keyboard_coding;
333
334 Lisp_Object Vfile_coding_system_alist;
335 Lisp_Object Vprocess_coding_system_alist;
336 Lisp_Object Vnetwork_coding_system_alist;
337
338 #endif /* emacs */
339
340 Lisp_Object Qcoding_category, Qcoding_category_index;
341
342 /* List of symbols `coding-category-xxx' ordered by priority. */
343 Lisp_Object Vcoding_category_list;
344
345 /* Table of coding categories (Lisp symbols). */
346 Lisp_Object Vcoding_category_table;
347
348 /* Table of names of symbol for each coding-category. */
349 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
350 "coding-category-emacs-mule",
351 "coding-category-sjis",
352 "coding-category-iso-7",
353 "coding-category-iso-7-tight",
354 "coding-category-iso-8-1",
355 "coding-category-iso-8-2",
356 "coding-category-iso-7-else",
357 "coding-category-iso-8-else",
358 "coding-category-big5",
359 "coding-category-raw-text",
360 "coding-category-binary"
361 };
362
363 /* Table of pointers to coding systems corresponding to each coding
364 category. */
365 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
366
367 /* Flag to tell if we look up unification table on character code
368 conversion. */
369 Lisp_Object Venable_character_unification;
370 /* Standard unification table to look up on decoding (reading). */
371 Lisp_Object Vstandard_character_unification_table_for_decode;
372 /* Standard unification table to look up on encoding (writing). */
373 Lisp_Object Vstandard_character_unification_table_for_encode;
374
375 Lisp_Object Qcharacter_unification_table;
376 Lisp_Object Qcharacter_unification_table_for_decode;
377 Lisp_Object Qcharacter_unification_table_for_encode;
378
379 /* Alist of charsets vs revision number. */
380 Lisp_Object Vcharset_revision_alist;
381
382 /* Default coding systems used for process I/O. */
383 Lisp_Object Vdefault_process_coding_system;
384
385 \f
386 /*** 2. Emacs internal format (emacs-mule) handlers ***/
387
388 /* Emacs' internal format for encoding multiple character sets is a
389 kind of multi-byte encoding, i.e. characters are encoded by
390 variable-length sequences of one-byte codes. ASCII characters
391 and control characters (e.g. `tab', `newline') are represented by
392 one-byte sequences which are their ASCII codes, in the range 0x00
393 through 0x7F. The other characters are represented by a sequence
394 of `base leading-code', optional `extended leading-code', and one
395 or two `position-code's. The length of the sequence is determined
396 by the base leading-code. Leading-code takes the range 0x80
397 through 0x9F, whereas extended leading-code and position-code take
398 the range 0xA0 through 0xFF. See `charset.h' for more details
399 about leading-code and position-code.
400
401 There's one exception to this rule. Special leading-code
402 `leading-code-composition' denotes that the following several
403 characters should be composed into one character. Leading-codes of
404 components (except for ASCII) are added 0x20. An ASCII character
405 component is represented by a 2-byte sequence of `0xA0' and
406 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
407 details of composite character. Hence, we can summarize the code
408 range as follows:
409
410 --- CODE RANGE of Emacs' internal format ---
411 (character set) (range)
412 ASCII 0x00 .. 0x7F
413 ELSE (1st byte) 0x80 .. 0x9F
414 (rest bytes) 0xA0 .. 0xFF
415 ---------------------------------------------
416
417 */
418
419 enum emacs_code_class_type emacs_code_class[256];
420
/* Consume one byte at `src' and go on to the next statement only if
   that byte is in the range 0xA0..0xFF.  If no byte is left before
   `src_end', jump to `label_end_of_switch'; if the byte is less than
   0xA0, make the enclosing function return 0.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_switch;         \
    if (*src++ < 0xA0)                  \
      return 0;                         \
  } while (0)
430
431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
432 Check if a text is encoded in Emacs' internal format. If it is,
433 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
434
435 int
436 detect_coding_emacs_mule (src, src_end)
437 unsigned char *src, *src_end;
438 {
439 unsigned char c;
440 int composing = 0;
441
442 while (src < src_end)
443 {
444 c = *src++;
445
446 if (composing)
447 {
448 if (c < 0xA0)
449 composing = 0;
450 else
451 c -= 0x20;
452 }
453
454 switch (emacs_code_class[c])
455 {
456 case EMACS_ascii_code:
457 case EMACS_linefeed_code:
458 break;
459
460 case EMACS_control_code:
461 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
462 return 0;
463 break;
464
465 case EMACS_invalid_code:
466 return 0;
467
468 case EMACS_leading_code_composition: /* c == 0x80 */
469 if (composing)
470 CHECK_CODE_RANGE_A0_FF;
471 else
472 composing = 1;
473 break;
474
475 case EMACS_leading_code_4:
476 CHECK_CODE_RANGE_A0_FF;
477 /* fall down to check it two more times ... */
478
479 case EMACS_leading_code_3:
480 CHECK_CODE_RANGE_A0_FF;
481 /* fall down to check it one more time ... */
482
483 case EMACS_leading_code_2:
484 CHECK_CODE_RANGE_A0_FF;
485 break;
486
487 default:
488 label_end_of_switch:
489 break;
490 }
491 }
492 return CODING_CATEGORY_MASK_EMACS_MULE;
493 }
494
495 \f
496 /*** 3. ISO2022 handlers ***/
497
498 /* The following note describes the coding system ISO2022 briefly.
499 Since the intention of this note is to help in understanding of
500 the programs in this file, some parts are NOT ACCURATE or OVERLY
501 SIMPLIFIED. For the thorough understanding, please refer to the
502 original document of ISO2022.
503
504 ISO2022 provides many mechanisms to encode several character sets
505 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
506 all text is encoded by codes of less than 128. This may make the
507 encoded text a little bit longer, but the text gets more stability
508 to pass through several gateways (some of them strip off the MSB).
509
510 There are two kinds of character set: control character set and
511 graphic character set. The former contains control characters such
512 as `newline' and `escape' to provide control functions (control
513 functions are provided also by escape sequences). The latter
514 contains graphic characters such as 'A' and '-'. Emacs recognizes
515 two control character sets and many graphic character sets.
516
517 Graphic character sets are classified into one of the following
518 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
519 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
520 bytes (DIMENSION) and the number of characters in one dimension
521 (CHARS) of the set. In addition, each character set is assigned an
522 identification tag (called "final character" and denoted as <F>
523 here after) which is unique in each class. <F> of each character
524 set is decided by ECMA(*) when it is registered in ISO. Code range
525 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
526
527 Note (*): ECMA = European Computer Manufacturers Association
528
529 Here are examples of graphic character set [NAME(<F>)]:
530 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
531 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
532 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
533 o DIMENSION2_CHARS96 -- none for the moment
534
535 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
536 C0 [0x00..0x1F] -- control character plane 0
537 GL [0x20..0x7F] -- graphic character plane 0
538 C1 [0x80..0x9F] -- control character plane 1
539 GR [0xA0..0xFF] -- graphic character plane 1
540
541 A control character set is directly designated and invoked to C0 or
542 C1 by an escape sequence. The most common case is that ISO646's
543 control character set is designated/invoked to C0 and ISO6429's
544 control character set is designated/invoked to C1, and usually
545 these designations/invocations are omitted in a coded text. With
546 7-bit environment, only C0 can be used, and a control character for
547 C1 is encoded by an appropriate escape sequence to fit in the
548 environment. Every control character of C1 has a corresponding
549 escape sequence defined for it.
550
551 A graphic character set is at first designated to one of four
552 graphic registers (G0 through G3), then these graphic registers are
553 invoked to GL or GR. These designations and invocations can be
554 done independently. The most common case is that G0 is invoked to
555 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
556 these invocations and designations are omitted in a coded text.
557 With 7-bit environment, only GL can be used.
558
559 When a graphic character set of CHARS94 is invoked to GL, code 0x20
560 and 0x7F of GL area work as control characters SPACE and DEL
561 respectively, and code 0xA0 and 0xFF of GR area should not be used.
562
563 There are two ways of invocation: locking-shift and single-shift.
564 With locking-shift, the invocation lasts until the next different
565 invocation, whereas with single-shift, the invocation works only
566 for the following character and doesn't affect locking-shift.
567 Invocations are done by the following control characters or escape
568 sequences.
569
570 ----------------------------------------------------------------------
571 function control char escape sequence description
572 ----------------------------------------------------------------------
573 SI (shift-in) 0x0F none invoke G0 to GL
574 SO (shift-out) 0x0E none invoke G1 to GL
575 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
576 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
577 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
578 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
579 ----------------------------------------------------------------------
580 The first four are for locking-shift. Control characters for these
581 functions are defined by macros ISO_CODE_XXX in `coding.h'.
582
583 Designations are done by the following escape sequences.
584 ----------------------------------------------------------------------
585 escape sequence description
586 ----------------------------------------------------------------------
587 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
588 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
589 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
590 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
591 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
592 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
593 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
594 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
595 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
596 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
597 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
598 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
599 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
600 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
601 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
602 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
603 ----------------------------------------------------------------------
604
605 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
606 of dimension 1, chars 94, and final character <F>, and etc.
607
608 Note (*): Although these designations are not allowed in ISO2022,
609 Emacs accepts them on decoding, and produces them on encoding
610 CHARS96 character set in a coding system which is characterized as
611 7-bit environment, non-locking-shift, and non-single-shift.
612
613 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
614 '(' can be omitted. We call this as "short-form" here after.
615
616 Now you may notice that there are a lot of ways for encoding the
617 same multilingual text in ISO2022. Actually, there exist many
618 coding systems such as Compound Text (used in X's inter-client
619 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
620 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
621 localized platforms), and all of these are variants of ISO2022.
622
623 In addition to the above, Emacs handles two more kinds of escape
624 sequences: ISO6429's direction specification and Emacs' private
625 sequence for specifying character composition.
626
627 ISO6429's direction specification takes the following format:
628 o CSI ']' -- end of the current direction
629 o CSI '0' ']' -- end of the current direction
630 o CSI '1' ']' -- start of left-to-right text
631 o CSI '2' ']' -- start of right-to-left text
632 The control character CSI (0x9B: control sequence introducer) is
633 abbreviated to the escape sequence ESC '[' in 7-bit environment.
634
635 Character composition specification takes the following format:
636 o ESC '0' -- start character composition
637 o ESC '1' -- end character composition
638 Since these are not standard escape sequences of any ISO, the use
639 of them for these meaning is restricted to Emacs only. */
640
641 enum iso_code_class_type iso_code_class[256];
642
/* Nonzero if the coding system registered in `coding_system_table'
   for category index IDX has a requested designation for CHARSET,
   i.e. anything other than CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  */
#define CHARSET_OK(idx, charset)                                        \
  (CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION                             \
   != CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                             charset))

/* Nonzero if the coding system registered in `coding_system_table'
   for category index IDX has a non-negative initial designation for
   graphic register 1.  */
#define SHIFT_OUT_OK(idx)                                               \
  (0 <= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1))
650
651 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
652 Check if a text is encoded in ISO2022. If it is, returns an
653 integer in which appropriate flag bits any of:
654 CODING_CATEGORY_MASK_ISO_7
655 CODING_CATEGORY_MASK_ISO_7_TIGHT
656 CODING_CATEGORY_MASK_ISO_8_1
657 CODING_CATEGORY_MASK_ISO_8_2
658 CODING_CATEGORY_MASK_ISO_7_ELSE
659 CODING_CATEGORY_MASK_ISO_8_ELSE
660 are set. If a code which should never appear in ISO2022 is found,
661 returns 0. */
662
663 int
664 detect_coding_iso2022 (src, src_end)
665 unsigned char *src, *src_end;
666 {
667 int mask = CODING_CATEGORY_MASK_ISO;
668 int mask_found = 0;
669 int reg[4], shift_out = 0;
670 int c, c1, i, charset;
671
672 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
673 while (mask && src < src_end)
674 {
675 c = *src++;
676 switch (c)
677 {
678 case ISO_CODE_ESC:
679 if (src >= src_end)
680 break;
681 c = *src++;
682 if (c >= '(' && c <= '/')
683 {
684 /* Designation sequence for a charset of dimension 1. */
685 if (src >= src_end)
686 break;
687 c1 = *src++;
688 if (c1 < ' ' || c1 >= 0x80
689 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
690 /* Invalid designation sequence. Just ignore. */
691 break;
692 reg[(c - '(') % 4] = charset;
693 }
694 else if (c == '$')
695 {
696 /* Designation sequence for a charset of dimension 2. */
697 if (src >= src_end)
698 break;
699 c = *src++;
700 if (c >= '@' && c <= 'B')
701 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
702 reg[0] = charset = iso_charset_table[1][0][c];
703 else if (c >= '(' && c <= '/')
704 {
705 if (src >= src_end)
706 break;
707 c1 = *src++;
708 if (c1 < ' ' || c1 >= 0x80
709 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
710 /* Invalid designation sequence. Just ignore. */
711 break;
712 reg[(c - '(') % 4] = charset;
713 }
714 else
715 /* Invalid designation sequence. Just ignore. */
716 break;
717 }
718 else if (c == 'N' || c == 'n')
719 {
720 if (shift_out == 0
721 && (reg[1] >= 0
722 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
723 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
724 {
725 /* Locking shift out. */
726 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
727 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
728 shift_out = 1;
729 }
730 break;
731 }
732 else if (c == 'O' || c == 'o')
733 {
734 if (shift_out == 1)
735 {
736 /* Locking shift in. */
737 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
738 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
739 shift_out = 0;
740 }
741 break;
742 }
743 else if (c == '0' || c == '1' || c == '2')
744 /* Start/end composition. Just ignore. */
745 break;
746 else
747 /* Invalid escape sequence. Just ignore. */
748 break;
749
750 /* We found a valid designation sequence for CHARSET. */
751 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
752 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
753 mask_found |= CODING_CATEGORY_MASK_ISO_7;
754 else
755 mask &= ~CODING_CATEGORY_MASK_ISO_7;
756 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
757 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
758 else
759 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
760 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
761 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
762 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
763 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
764 break;
765
766 case ISO_CODE_SO:
767 if (shift_out == 0
768 && (reg[1] >= 0
769 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
770 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
771 {
772 /* Locking shift out. */
773 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
774 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
775 }
776 break;
777
778 case ISO_CODE_SI:
779 if (shift_out == 1)
780 {
781 /* Locking shift in. */
782 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
783 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
784 }
785 break;
786
787 case ISO_CODE_CSI:
788 case ISO_CODE_SS2:
789 case ISO_CODE_SS3:
790 {
791 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
792
793 if (c != ISO_CODE_CSI)
794 {
795 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
796 & CODING_FLAG_ISO_SINGLE_SHIFT)
797 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
798 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
799 & CODING_FLAG_ISO_SINGLE_SHIFT)
800 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
801 }
802 if (VECTORP (Vlatin_extra_code_table)
803 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
804 {
805 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
806 & CODING_FLAG_ISO_LATIN_EXTRA)
807 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
808 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
809 & CODING_FLAG_ISO_LATIN_EXTRA)
810 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
811 }
812 mask &= newmask;
813 mask_found |= newmask;
814 }
815 break;
816
817 default:
818 if (c < 0x80)
819 break;
820 else if (c < 0xA0)
821 {
822 if (VECTORP (Vlatin_extra_code_table)
823 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
824 {
825 int newmask = 0;
826
827 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
828 & CODING_FLAG_ISO_LATIN_EXTRA)
829 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
830 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
831 & CODING_FLAG_ISO_LATIN_EXTRA)
832 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
833 mask &= newmask;
834 mask_found |= newmask;
835 }
836 else
837 return 0;
838 }
839 else
840 {
841 unsigned char *src_begin = src;
842
843 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
844 | CODING_CATEGORY_MASK_ISO_7_ELSE);
845 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
846 while (src < src_end && *src >= 0xA0)
847 src++;
848 if ((src - src_begin - 1) & 1 && src < src_end)
849 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
850 else
851 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
852 }
853 break;
854 }
855 }
856
857 return (mask & mask_found);
858 }
859
/* Decode a character whose charset is CHARSET and whose first
   position-code is C1, writing its multi-byte form at `dst'.  If the
   dimension of CHARSET is 2, the second position-code is fetched from
   `src' into the caller's variable `c2'; when that byte is not a
   graphic code, it is left unread and a space is used instead.  If
   `unification_table' is non-nil and maps the character to another
   one, that one is written instead.  If CHARSET is negative, it means
   that we are decoding ill-formed text, and what we can do is just to
   write C1 as is.  The composition state in `coding->composing' is
   advanced as needed.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt;                                                          \
    int charset_alt = (charset);                                        \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          {                                                             \
            /* To tell composition rules are embedded.  */              \
            *dst++ = 0xFF;                                              \
          }                                                             \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          {                                                             \
            int c2_class;                                               \
            ONE_MORE_BYTE (c2);                                         \
            c2_class = iso_code_class[(c2) & 0x7F];                     \
            if (c2_class != ISO_0x20_or_0x7F                            \
                && c2_class != ISO_graphic_plane_0)                     \
              {                                                         \
                /* Not a position-code: unread it.  */                  \
                src--;                                                  \
                c2 = ' ';                                               \
              }                                                         \
          }                                                             \
        if (!NILP (unification_table))                                  \
          {                                                             \
            c_alt = unify_char (unification_table,                      \
                                -1, (charset), c1, c2);                 \
            if (c_alt >= 0)                                             \
              {                                                         \
                SPLIT_CHAR (c_alt, charset_alt, c1, c2);                \
              }                                                         \
          }                                                             \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      {                                                                 \
        /* To tell a composition rule follows.  */                      \
        coding->composing = COMPOSING_WITH_RULE_RULE;                   \
      }                                                                 \
  } while (0)
904
/* Record in CODING that the charset selected by DIMENSION, CHARS and
   FINAL_CHAR is now designated to graphic register REG.  If no such
   charset exists, or CODING does not request it for REG, remember
   REG as the last invalid designation register and jump to
   `label_invalid_code' of the expansion site.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)		   \
  do {									   \
    int charset = ISO_CHARSET_TABLE (make_number (dimension),		   \
				     make_number (chars),		   \
				     make_number (final_char));		   \
    int keep_sequence_as_is;						   \
    if (charset < 0							   \
	|| (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)	   \
	    != (reg)))							   \
      {									   \
	coding->spec.iso2022.last_invalid_designation_register = (reg);	   \
	goto label_invalid_code;					   \
      }									   \
    /* A designation of ASCII to register 0 that follows an invalid	   \
       designation to register 0 is inserted as is, so that it is	   \
       surely written back to a file.  */				   \
    keep_sequence_as_is							   \
      = (coding->spec.iso2022.last_invalid_designation_register == 0	   \
	 && (reg) == 0							   \
	 && charset == CHARSET_ASCII);					   \
    coding->spec.iso2022.last_invalid_designation_register = -1;	   \
    if (keep_sequence_as_is)						   \
      goto label_invalid_code;						   \
    if ((coding->mode & CODING_MODE_DIRECTION)				   \
	&& CHARSET_REVERSE_CHARSET (charset) >= 0)			   \
      {									   \
	charset = CHARSET_REVERSE_CHARSET (charset);			   \
      }									   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		   \
  } while (0)
935
936 /* Check if the current composing sequence contains only valid codes.
937 If the composing sequence doesn't end before SRC_END, return -1.
938 Else, if it contains only valid codes, return 0.
939 Else return the length of the composing sequence. */
940
941 int check_composing_code (coding, src, src_end)
942 struct coding_system *coding;
943 unsigned char *src, *src_end;
944 {
945 unsigned char *src_start = src;
946 int invalid_code_found = 0;
947 int charset, c, c1, dim;
948
949 while (src < src_end)
950 {
951 if (*src++ != ISO_CODE_ESC) continue;
952 if (src >= src_end) break;
953 if ((c = *src++) == '1') /* end of compsition */
954 return (invalid_code_found ? src - src_start : 0);
955 if (src + 2 >= src_end) break;
956 if (!coding->flags & CODING_FLAG_ISO_DESIGNATION)
957 invalid_code_found = 1;
958 else
959 {
960 dim = 0;
961 if (c == '$')
962 {
963 dim = 1;
964 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
965 }
966 if (c >= '(' && c <= '/')
967 {
968 c1 = *src++;
969 if ((c1 < ' ' || c1 >= 0x80)
970 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
971 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
972 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
973 invalid_code_found = 1;
974 }
975 else
976 invalid_code_found = 1;
977 }
978 }
979 return ((coding->mode & CODING_MODE_LAST_BLOCK) ? src_end - src_start : -1);
980 }
981
982 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
983
984 int
985 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
986 struct coding_system *coding;
987 unsigned char *source, *destination;
988 int src_bytes, dst_bytes;
989 {
990 unsigned char *src = source;
991 unsigned char *src_end = source + src_bytes;
992 unsigned char *dst = destination;
993 unsigned char *dst_end = destination + dst_bytes;
994 /* Since the maximum bytes produced by each loop is 7, we subtract 6
995 from DST_END to assure that overflow checking is necessary only
996 at the head of loop. */
997 unsigned char *adjusted_dst_end = dst_end - 6;
998 int charset;
999 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1000 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1001 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1002 Lisp_Object unification_table
1003 = coding->character_unification_table_for_decode;
1004 int result = CODING_FINISH_NORMAL;
1005
1006 if (!NILP (Venable_character_unification) && NILP (unification_table))
1007 unification_table = Vstandard_character_unification_table_for_decode;
1008
1009 coding->produced_char = 0;
1010 while (src < src_end && (dst_bytes
1011 ? (dst < adjusted_dst_end)
1012 : (dst < src - 6)))
1013 {
1014 /* SRC_BASE remembers the start position in source in each loop.
1015 The loop will be exited when there's not enough source text
1016 to analyze long escape sequence or 2-byte code (within macros
1017 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1018 to SRC_BASE before exiting. */
1019 unsigned char *src_base = src;
1020 int c1 = *src++, c2;
1021
1022 switch (iso_code_class [c1])
1023 {
1024 case ISO_0x20_or_0x7F:
1025 if (!coding->composing
1026 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1027 {
1028 /* This is SPACE or DEL. */
1029 *dst++ = c1;
1030 coding->produced_char++;
1031 break;
1032 }
1033 /* This is a graphic character, we fall down ... */
1034
1035 case ISO_graphic_plane_0:
1036 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1037 {
1038 /* This is a composition rule. */
1039 *dst++ = c1 | 0x80;
1040 coding->composing = COMPOSING_WITH_RULE_TAIL;
1041 }
1042 else
1043 DECODE_ISO_CHARACTER (charset0, c1);
1044 break;
1045
1046 case ISO_0xA0_or_0xFF:
1047 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1048 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1049 {
1050 /* Invalid code. */
1051 *dst++ = c1;
1052 coding->produced_char++;
1053 break;
1054 }
1055 /* This is a graphic character, we fall down ... */
1056
1057 case ISO_graphic_plane_1:
1058 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1059 {
1060 /* Invalid code. */
1061 *dst++ = c1;
1062 coding->produced_char++;
1063 }
1064 else
1065 DECODE_ISO_CHARACTER (charset1, c1);
1066 break;
1067
1068 case ISO_control_code:
1069 /* All ISO2022 control characters in this class have the
1070 same representation in Emacs internal format. */
1071 if (c1 == '\n'
1072 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1073 && (coding->eol_type == CODING_EOL_CR
1074 || coding->eol_type == CODING_EOL_CRLF))
1075 {
1076 result = CODING_FINISH_INCONSISTENT_EOL;
1077 goto label_end_of_loop_2;
1078 }
1079 *dst++ = c1;
1080 coding->produced_char++;
1081 break;
1082
1083 case ISO_carriage_return:
1084 if (coding->eol_type == CODING_EOL_CR)
1085 *dst++ = '\n';
1086 else if (coding->eol_type == CODING_EOL_CRLF)
1087 {
1088 ONE_MORE_BYTE (c1);
1089 if (c1 == ISO_CODE_LF)
1090 *dst++ = '\n';
1091 else
1092 {
1093 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1094 {
1095 result = CODING_FINISH_INCONSISTENT_EOL;
1096 goto label_end_of_loop_2;
1097 }
1098 src--;
1099 *dst++ = '\r';
1100 }
1101 }
1102 else
1103 *dst++ = c1;
1104 coding->produced_char++;
1105 break;
1106
1107 case ISO_shift_out:
1108 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1109 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1110 goto label_invalid_code;
1111 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1112 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1113 break;
1114
1115 case ISO_shift_in:
1116 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1117 goto label_invalid_code;
1118 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1119 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1120 break;
1121
1122 case ISO_single_shift_2_7:
1123 case ISO_single_shift_2:
1124 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1125 goto label_invalid_code;
1126 /* SS2 is handled as an escape sequence of ESC 'N' */
1127 c1 = 'N';
1128 goto label_escape_sequence;
1129
1130 case ISO_single_shift_3:
1131 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1132 goto label_invalid_code;
1133 /* SS2 is handled as an escape sequence of ESC 'O' */
1134 c1 = 'O';
1135 goto label_escape_sequence;
1136
1137 case ISO_control_sequence_introducer:
1138 /* CSI is handled as an escape sequence of ESC '[' ... */
1139 c1 = '[';
1140 goto label_escape_sequence;
1141
1142 case ISO_escape:
1143 ONE_MORE_BYTE (c1);
1144 label_escape_sequence:
1145 /* Escape sequences handled by Emacs are invocation,
1146 designation, direction specification, and character
1147 composition specification. */
1148 switch (c1)
1149 {
1150 case '&': /* revision of following character set */
1151 ONE_MORE_BYTE (c1);
1152 if (!(c1 >= '@' && c1 <= '~'))
1153 goto label_invalid_code;
1154 ONE_MORE_BYTE (c1);
1155 if (c1 != ISO_CODE_ESC)
1156 goto label_invalid_code;
1157 ONE_MORE_BYTE (c1);
1158 goto label_escape_sequence;
1159
1160 case '$': /* designation of 2-byte character set */
1161 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1162 goto label_invalid_code;
1163 ONE_MORE_BYTE (c1);
1164 if (c1 >= '@' && c1 <= 'B')
1165 { /* designation of JISX0208.1978, GB2312.1980,
1166 or JISX0208.1980 */
1167 DECODE_DESIGNATION (0, 2, 94, c1);
1168 }
1169 else if (c1 >= 0x28 && c1 <= 0x2B)
1170 { /* designation of DIMENSION2_CHARS94 character set */
1171 ONE_MORE_BYTE (c2);
1172 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1173 }
1174 else if (c1 >= 0x2C && c1 <= 0x2F)
1175 { /* designation of DIMENSION2_CHARS96 character set */
1176 ONE_MORE_BYTE (c2);
1177 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1178 }
1179 else
1180 goto label_invalid_code;
1181 break;
1182
1183 case 'n': /* invocation of locking-shift-2 */
1184 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1185 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1186 goto label_invalid_code;
1187 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1188 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1189 break;
1190
1191 case 'o': /* invocation of locking-shift-3 */
1192 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1193 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1194 goto label_invalid_code;
1195 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1196 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1197 break;
1198
1199 case 'N': /* invocation of single-shift-2 */
1200 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1201 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1202 goto label_invalid_code;
1203 ONE_MORE_BYTE (c1);
1204 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1205 DECODE_ISO_CHARACTER (charset, c1);
1206 break;
1207
1208 case 'O': /* invocation of single-shift-3 */
1209 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1210 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1211 goto label_invalid_code;
1212 ONE_MORE_BYTE (c1);
1213 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1214 DECODE_ISO_CHARACTER (charset, c1);
1215 break;
1216
1217 case '0': case '2': /* start composing */
1218 /* Before processing composing, we must be sure that all
1219 characters being composed are supported by CODING.
1220 If not, we must give up composing and insert the
1221 bunch of codes for composing as is without decoding. */
1222 {
1223 int result1;
1224
1225 result1 = check_composing_code (coding, src, src_end);
1226 if (result1 == 0)
1227 coding->composing = (c1 == '0'
1228 ? COMPOSING_NO_RULE_HEAD
1229 : COMPOSING_WITH_RULE_HEAD);
1230 else if (result1 > 0)
1231 {
1232 if (result1 + 2 < (dst_bytes ? dst_end : src_base) - dst)
1233 {
1234 bcopy (src_base, dst, result1 + 2);
1235 src += result1;
1236 dst += result1 + 2;
1237 coding->produced_char += result1 + 2;
1238 }
1239 else
1240 {
1241 result = CODING_FINISH_INSUFFICIENT_DST;
1242 goto label_end_of_loop_2;
1243 }
1244 }
1245 else
1246 goto label_end_of_loop;
1247 }
1248 break;
1249
1250 case '1': /* end composing */
1251 coding->composing = COMPOSING_NO;
1252 coding->produced_char++;
1253 break;
1254
1255 case '[': /* specification of direction */
1256 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1257 goto label_invalid_code;
1258 /* For the moment, nested direction is not supported.
1259 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1260 left-to-right, and nozero means right-to-left. */
1261 ONE_MORE_BYTE (c1);
1262 switch (c1)
1263 {
1264 case ']': /* end of the current direction */
1265 coding->mode &= ~CODING_MODE_DIRECTION;
1266
1267 case '0': /* end of the current direction */
1268 case '1': /* start of left-to-right direction */
1269 ONE_MORE_BYTE (c1);
1270 if (c1 == ']')
1271 coding->mode &= ~CODING_MODE_DIRECTION;
1272 else
1273 goto label_invalid_code;
1274 break;
1275
1276 case '2': /* start of right-to-left direction */
1277 ONE_MORE_BYTE (c1);
1278 if (c1 == ']')
1279 coding->mode |= CODING_MODE_DIRECTION;
1280 else
1281 goto label_invalid_code;
1282 break;
1283
1284 default:
1285 goto label_invalid_code;
1286 }
1287 break;
1288
1289 default:
1290 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1291 goto label_invalid_code;
1292 if (c1 >= 0x28 && c1 <= 0x2B)
1293 { /* designation of DIMENSION1_CHARS94 character set */
1294 ONE_MORE_BYTE (c2);
1295 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1296 }
1297 else if (c1 >= 0x2C && c1 <= 0x2F)
1298 { /* designation of DIMENSION1_CHARS96 character set */
1299 ONE_MORE_BYTE (c2);
1300 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1301 }
1302 else
1303 {
1304 goto label_invalid_code;
1305 }
1306 }
1307 /* We must update these variables now. */
1308 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1309 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1310 break;
1311
1312 label_invalid_code:
1313 coding->produced_char += src - src_base;
1314 while (src_base < src)
1315 *dst++ = *src_base++;
1316 }
1317 continue;
1318
1319 label_end_of_loop:
1320 result = CODING_FINISH_INSUFFICIENT_SRC;
1321 label_end_of_loop_2:
1322 src = src_base;
1323 break;
1324 }
1325
1326 if (result == CODING_FINISH_NORMAL
1327 && src < src_end)
1328 result = CODING_FINISH_INSUFFICIENT_DST;
1329
1330 /* If this is the last block of the text to be decoded, we had
1331 better just flush out all remaining codes in the text although
1332 they are not valid characters. */
1333 if (coding->mode & CODING_MODE_LAST_BLOCK)
1334 {
1335 bcopy (src, dst, src_end - src);
1336 dst += (src_end - src);
1337 src = src_end;
1338 }
1339 coding->consumed = coding->consumed_char = src - source;
1340 coding->produced = dst - destination;
1341 return result;
1342 }
1343
1344 /* ISO2022 encoding stuff. */
1345
1346 /*
1347 It is not enough to say just "ISO2022" on encoding, we have to
1348 specify more details. In Emacs, each coding system of ISO2022
1349 variant has the following specifications:
1350 1. Initial designation to G0 thru G3.
1351 2. Allows short-form designation?
1352 3. ASCII should be designated to G0 before control characters?
1353 4. ASCII should be designated to G0 at end of line?
1354 5. 7-bit environment or 8-bit environment?
1355 6. Use locking-shift?
1356 7. Use Single-shift?
1357 And the following two are only for Japanese:
1358 8. Use ASCII in place of JIS0201-1976-Roman?
1359 9. Use JISX0208-1983 in place of JISX0208-1978?
1360 These specifications are encoded in `coding->flags' as flag bits
1361 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1362 details.
1363 */
1364
/* Produce at DST the escape sequence that designates CHARSET to
   graphic register REG, and record the designation in CODING.  A
   revision sequence (ESC '&' <revision>) comes first when CODING has
   a revision number below 255 for CHARSET.  The short form, which
   omits the intermediate character, is produced for a 2-byte 94-char
   set designated to register 0 whose <final-char> is '@', 'A' or 'B',
   provided that CODING allows it.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);	\
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);	\
    int two_byte_set = (CHARSET_DIMENSION (charset) != 1);		\
    int set_of_94 = (CHARSET_CHARS (charset) == 94);			\
    /* Intermediate characters for registers 0 thru 3.  */		\
    char *intermediate_chars = set_of_94 ? "()*+" : ",-./";		\
    if (revision < 255)							\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + revision;					\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (two_byte_set)							\
      {									\
	*dst++ = '$';							\
      }									\
    /* The intermediate character is left out only in the short	\
       form.  */							\
    if (! (two_byte_set && set_of_94					\
	   && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)		\
	   && reg == 0							\
	   && final_char >= '@' && final_char <= 'B'))			\
      {									\
	*dst++ = (unsigned char) (intermediate_chars[reg]);		\
      }									\
    *dst++ = final_char;						\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
1406
/* The two macros below produce at DST the code for the ISO2022
   single-shift functions single-shift-2 and single-shift-3: the
   escape sequence (ESC 'N' or ESC 'O') when CODING is limited to
   seven bits, else the control character (SS2 or SS3).  They also
   mark CODING as being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2				\
  do {							\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))	\
      {							\
	*dst++ = ISO_CODE_SS2;				\
      }							\
    else						\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'N';					\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3				\
  do {							\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))	\
      {							\
	*dst++ = ISO_CODE_SS3;				\
      }							\
    else						\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'O';					\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)
1428
/* The four macros below produce at DST the code for the ISO2022
   locking-shift functions (shift-in, shift-out, locking-shift-2, and
   locking-shift-3), and record in CODING which graphic register (0
   thru 3 respectively) is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
    *dst++ = ISO_CODE_SI;				\
  } while (0)

#define ENCODE_SHIFT_OUT				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
    *dst++ = ISO_CODE_SO;				\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
  } while (0)
1456
/* Produce at DST the code for a DIMENSION1 character of charset
   CHARSET whose position-code is C1.  When CHARSET is not invoked to
   any graphic plane yet, the necessary designation and invocation
   sequences are produced first.  When CODING has the
   CODING_FLAG_ISO_SAFE flag and CHARSET is not among its safe
   charsets, CODING_INHIBIT_CHARACTER_SUBSTITUTION is produced
   instead, once or twice according to the width of CHARSET.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    for (;;)								\
      {									\
	if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
	  {								\
	    /* A single-shift code has just been produced.  */		\
	    *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
		      ? (c1 & 0x7F) : (c1 | 0x80));			\
	    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;		\
	    break;							\
	  }								\
	if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    break;							\
	  }								\
	if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    break;							\
	  }								\
	if ((coding->flags & CODING_FLAG_ISO_SAFE)			\
	    && !coding->safe_charsets[charset])				\
	  {								\
	    /* This character must not be encoded.  */			\
	    *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	    if (CHARSET_WIDTH (charset) == 2)				\
	      *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	    break;							\
	  }								\
	/* CHARSET is not invoked to any graphic plane yet.  Invoke	\
	   it (designating it to some graphic register first if	\
	   needed), then go round again to produce the character	\
	   itself.  */							\
	dst = encode_invocation_designation (charset, coding, dst);	\
      }									\
  } while (0)
1500
/* Produce at DST the codes for a DIMENSION2 character of charset
   CHARSET whose position-codes are C1 and C2.  When CHARSET is not
   invoked to any graphic plane yet, the necessary designation and
   invocation sequences are produced first.  When CODING has the
   CODING_FLAG_ISO_SAFE flag and CHARSET is not among its safe
   charsets, CODING_INHIBIT_CHARACTER_SUBSTITUTION is produced
   instead, once or twice according to the width of CHARSET.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    for (;;)								\
      {									\
	if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
	  {								\
	    /* A single-shift code has just been produced.  */		\
	    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)		\
	      {								\
		*dst++ = c1 & 0x7F;					\
		*dst++ = c2 & 0x7F;					\
	      }								\
	    else							\
	      {								\
		*dst++ = c1 | 0x80;					\
		*dst++ = c2 | 0x80;					\
	      }								\
	    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;		\
	    break;							\
	  }								\
	if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    *dst++ = c2 & 0x7F;						\
	    break;							\
	  }								\
	if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    *dst++ = c2 | 0x80;						\
	    break;							\
	  }								\
	if ((coding->flags & CODING_FLAG_ISO_SAFE)			\
	    && !coding->safe_charsets[charset])				\
	  {								\
	    /* This character must not be encoded.  */			\
	    *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	    if (CHARSET_WIDTH (charset) == 2)				\
	      *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	    break;							\
	  }								\
	/* CHARSET is not invoked to any graphic plane yet.  Invoke	\
	   it (designating it to some graphic register first if	\
	   needed), then go round again to produce the character	\
	   itself.  */							\
	dst = encode_invocation_designation (charset, coding, dst);	\
      }									\
  } while (0)
1543
/* Encode the character of charset CHARSET whose position-codes are
   C1 and C2 (C2 is a dummy for a DIMENSION1 charset).  The character
   is first mapped through `unification_table' of the expansion site
   when that table is non-nil and has a mapping for it.  ASCII is
   replaced by `charset_latin_jisx0201', and `charset_jisx0208' by
   `charset_jisx0208_1978', when the corresponding flag of CODING is
   set.  `coding->consumed_char' is incremented unless a composition
   is in progress.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)				  \
  do {									  \
    int c_alt;								  \
    int charset_alt = charset;						  \
    if (!NILP (unification_table))					  \
      {									  \
	c_alt = unify_char (unification_table, -1, charset, c1, c2);	  \
	if (c_alt >= 0)							  \
	  {								  \
	    SPLIT_CHAR (c_alt, charset_alt, c1, c2);			  \
	  }								  \
      }									  \
    if (CHARSET_DIMENSION (charset_alt) != 1)				  \
      {									  \
	if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)		  \
	    && charset == charset_jisx0208)				  \
	  charset_alt = charset_jisx0208_1978;				  \
	ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		  \
      }									  \
    else								  \
      {									  \
	if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)			  \
	    && charset == CHARSET_ASCII)				  \
	  charset_alt = charset_latin_jisx0201;				  \
	ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);		  \
      }									  \
    if (! COMPOSING_P (coding->composing))				  \
      {									  \
	coding->consumed_char++;					  \
      }									  \
  } while (0)
1570
1571 /* Produce designation and invocation codes at a place pointed by DST
1572 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1573 Return new DST. */
1574
1575 unsigned char *
1576 encode_invocation_designation (charset, coding, dst)
1577 int charset;
1578 struct coding_system *coding;
1579 unsigned char *dst;
1580 {
1581 int reg; /* graphic register number */
1582
1583 /* At first, check designations. */
1584 for (reg = 0; reg < 4; reg++)
1585 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1586 break;
1587
1588 if (reg >= 4)
1589 {
1590 /* CHARSET is not yet designated to any graphic registers. */
1591 /* At first check the requested designation. */
1592 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1593 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1594 /* Since CHARSET requests no special designation, designate it
1595 to graphic register 0. */
1596 reg = 0;
1597
1598 ENCODE_DESIGNATION (charset, reg, coding);
1599 }
1600
1601 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1602 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1603 {
1604 /* Since the graphic register REG is not invoked to any graphic
1605 planes, invoke it to graphic plane 0. */
1606 switch (reg)
1607 {
1608 case 0: /* graphic register 0 */
1609 ENCODE_SHIFT_IN;
1610 break;
1611
1612 case 1: /* graphic register 1 */
1613 ENCODE_SHIFT_OUT;
1614 break;
1615
1616 case 2: /* graphic register 2 */
1617 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1618 ENCODE_SINGLE_SHIFT_2;
1619 else
1620 ENCODE_LOCKING_SHIFT_2;
1621 break;
1622
1623 case 3: /* graphic register 3 */
1624 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1625 ENCODE_SINGLE_SHIFT_3;
1626 else
1627 ENCODE_LOCKING_SHIFT_3;
1628 break;
1629 }
1630 }
1631 return dst;
1632 }
1633
/* The following three macros produce at DST the escape sequences
   that start a composition without rules (ESC '0'), start a
   composition with rules (ESC '2'), and end a composition
   (ESC '1').  */
#define ENCODE_COMPOSITION_NO_RULE_START	\
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START	\
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END			\
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1638
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC '[' when CODING is
   limited to seven bits, else the single control character CSI.  The
   seven-bit flag is tested with `&' as in the single-shift macros;
   comparing the whole flag word with `==' matched only when no other
   flag was set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R append '2' ']' and
   '0' ']' respectively to the introducer.  They are complete
   statements: the introducer is itself a `do ... while (0)'
   statement, so it cannot be an operand of the comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      *dst++ = ISO_CODE_ESC, *dst++ = '[';		\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

#define ENCODE_DIRECTION_R2L			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '2';				\
    *dst++ = ']';				\
  } while (0)

#define ENCODE_DIRECTION_L2R			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '0';				\
    *dst++ = ']';				\
  } while (0)
1654
/* Produce at DST the invocation and designation codes that bring the
   graphic planes and registers of CODING back to the initial state:
   shift-in if graphic register 0 is not the one invoked to graphic
   plane 0, then a designation for each register whose current
   charset differs from its (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					\
  do {									\
    int reg;								\
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			\
      {									\
	ENCODE_SHIFT_IN;						\
      }									\
    for (reg = 0; reg < 4; reg++)					\
      {									\
	int initial_charset						\
	  = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);		\
	if (initial_charset < 0						\
	    || (CODING_SPEC_ISO_DESIGNATION (coding, reg)		\
		== initial_charset))					\
	  continue;							\
	ENCODE_DESIGNATION (initial_charset, reg, coding);		\
      }									\
  } while (0)
1669
1670 /* Produce designation sequences of charsets in the line started from
1671 SRC to a place pointed by *DSTP, and update DSTP.
1672
1673 If the current block ends before any end-of-line, we may fail to
1674 find all the necessary designations. */
1675
1676 encode_designation_at_bol (coding, table, src, src_end, dstp)
1677 struct coding_system *coding;
1678 Lisp_Object table;
1679 unsigned char *src, *src_end, **dstp;
1680 {
1681 int charset, c, found = 0, reg;
1682 /* Table of charsets to be designated to each graphic register. */
1683 int r[4];
1684 unsigned char *dst = *dstp;
1685
1686 for (reg = 0; reg < 4; reg++)
1687 r[reg] = -1;
1688
1689 while (src < src_end && *src != '\n' && found < 4)
1690 {
1691 int bytes = BYTES_BY_CHAR_HEAD (*src);
1692
1693 if (NILP (table))
1694 charset = CHARSET_AT (src);
1695 else
1696 {
1697 int c_alt;
1698 unsigned char c1, c2;
1699
1700 SPLIT_STRING(src, bytes, charset, c1, c2);
1701 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1702 charset = CHAR_CHARSET (c_alt);
1703 }
1704
1705 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1706 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1707 {
1708 found++;
1709 r[reg] = charset;
1710 }
1711
1712 src += bytes;
1713 }
1714
1715 if (found)
1716 {
1717 for (reg = 0; reg < 4; reg++)
1718 if (r[reg] >= 0
1719 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1720 ENCODE_DESIGNATION (r[reg], reg, coding);
1721 *dstp = dst;
1722 }
1723 }
1724
1725 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1726
1727 int
1728 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1729 struct coding_system *coding;
1730 unsigned char *source, *destination;
1731 int src_bytes, dst_bytes;
1732 {
1733 unsigned char *src = source;
1734 unsigned char *src_end = source + src_bytes;
1735 unsigned char *dst = destination;
1736 unsigned char *dst_end = destination + dst_bytes;
1737 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1738 from DST_END to assure overflow checking is necessary only at the
1739 head of loop. */
1740 unsigned char *adjusted_dst_end = dst_end - 19;
1741 Lisp_Object unification_table
1742 = coding->character_unification_table_for_encode;
1743 int result = CODING_FINISH_NORMAL;
1744
1745 if (!NILP (Venable_character_unification) && NILP (unification_table))
1746 unification_table = Vstandard_character_unification_table_for_encode;
1747
1748 coding->consumed_char = 0;
1749 while (src < src_end && (dst_bytes
1750 ? (dst < adjusted_dst_end)
1751 : (dst < src - 19)))
1752 {
1753 /* SRC_BASE remembers the start position in source in each loop.
1754 The loop will be exited when there's not enough source text
1755 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1756 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1757 reset to SRC_BASE before exiting. */
1758 unsigned char *src_base = src;
1759 int charset, c1, c2, c3, c4;
1760
1761 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1762 && CODING_SPEC_ISO_BOL (coding))
1763 {
1764 /* We have to produce designation sequences if any now. */
1765 encode_designation_at_bol (coding, unification_table,
1766 src, src_end, &dst);
1767 CODING_SPEC_ISO_BOL (coding) = 0;
1768 }
1769
1770 c1 = *src++;
1771 /* If we are seeing a component of a composite character, we are
1772 seeing a leading-code encoded irregularly for composition, or
1773 a composition rule if composing with rule. We must set C1 to
1774 a normal leading-code or an ASCII code. If we are not seeing
1775 a composite character, we must reset composition,
1776 designation, and invocation states. */
1777 if (COMPOSING_P (coding->composing))
1778 {
1779 if (c1 < 0xA0)
1780 {
1781 /* We are not in a composite character any longer. */
1782 coding->composing = COMPOSING_NO;
1783 ENCODE_RESET_PLANE_AND_REGISTER;
1784 ENCODE_COMPOSITION_END;
1785 }
1786 else
1787 {
1788 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1789 {
1790 *dst++ = c1 & 0x7F;
1791 coding->composing = COMPOSING_WITH_RULE_HEAD;
1792 continue;
1793 }
1794 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1795 coding->composing = COMPOSING_WITH_RULE_RULE;
1796 if (c1 == 0xA0)
1797 {
1798 /* This is an ASCII component. */
1799 ONE_MORE_BYTE (c1);
1800 c1 &= 0x7F;
1801 }
1802 else
1803 /* This is a leading-code of non ASCII component. */
1804 c1 -= 0x20;
1805 }
1806 }
1807
1808 /* Now encode one character. C1 is a control character, an
1809 ASCII character, or a leading-code of multi-byte character. */
1810 switch (emacs_code_class[c1])
1811 {
1812 case EMACS_ascii_code:
1813 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1814 break;
1815
1816 case EMACS_control_code:
1817 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1818 ENCODE_RESET_PLANE_AND_REGISTER;
1819 *dst++ = c1;
1820 coding->consumed_char++;
1821 break;
1822
1823 case EMACS_carriage_return_code:
1824 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1825 {
1826 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1827 ENCODE_RESET_PLANE_AND_REGISTER;
1828 *dst++ = c1;
1829 coding->consumed_char++;
1830 break;
1831 }
1832 /* fall down to treat '\r' as '\n' ... */
1833
1834 case EMACS_linefeed_code:
1835 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1836 ENCODE_RESET_PLANE_AND_REGISTER;
1837 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1838 bcopy (coding->spec.iso2022.initial_designation,
1839 coding->spec.iso2022.current_designation,
1840 sizeof coding->spec.iso2022.initial_designation);
1841 if (coding->eol_type == CODING_EOL_LF
1842 || coding->eol_type == CODING_EOL_UNDECIDED)
1843 *dst++ = ISO_CODE_LF;
1844 else if (coding->eol_type == CODING_EOL_CRLF)
1845 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1846 else
1847 *dst++ = ISO_CODE_CR;
1848 CODING_SPEC_ISO_BOL (coding) = 1;
1849 coding->consumed_char++;
1850 break;
1851
1852 case EMACS_leading_code_2:
1853 ONE_MORE_BYTE (c2);
1854 if (c2 < 0xA0)
1855 {
1856 /* invalid sequence */
1857 *dst++ = c1;
1858 *dst++ = c2;
1859 coding->consumed_char += 2;
1860 }
1861 else
1862 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1863 break;
1864
1865 case EMACS_leading_code_3:
1866 TWO_MORE_BYTES (c2, c3);
1867 if (c2 < 0xA0 || c3 < 0xA0)
1868 {
1869 /* invalid sequence */
1870 *dst++ = c1;
1871 *dst++ = c2;
1872 *dst++ = c3;
1873 coding->consumed_char += 3;
1874 }
1875 else if (c1 < LEADING_CODE_PRIVATE_11)
1876 ENCODE_ISO_CHARACTER (c1, c2, c3);
1877 else
1878 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1879 break;
1880
1881 case EMACS_leading_code_4:
1882 THREE_MORE_BYTES (c2, c3, c4);
1883 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1884 {
1885 /* invalid sequence */
1886 *dst++ = c1;
1887 *dst++ = c2;
1888 *dst++ = c3;
1889 *dst++ = c4;
1890 coding->consumed_char += 4;
1891 }
1892 else
1893 ENCODE_ISO_CHARACTER (c2, c3, c4);
1894 break;
1895
1896 case EMACS_leading_code_composition:
1897 ONE_MORE_BYTE (c2);
1898 if (c2 < 0xA0)
1899 {
1900 /* invalid sequence */
1901 *dst++ = c1;
1902 *dst++ = c2;
1903 coding->consumed_char += 2;
1904 }
1905 else if (c2 == 0xFF)
1906 {
1907 ENCODE_RESET_PLANE_AND_REGISTER;
1908 coding->composing = COMPOSING_WITH_RULE_HEAD;
1909 ENCODE_COMPOSITION_WITH_RULE_START;
1910 coding->consumed_char++;
1911 }
1912 else
1913 {
1914 ENCODE_RESET_PLANE_AND_REGISTER;
1915 /* Rewind one byte because it is a character code of
1916 composition elements. */
1917 src--;
1918 coding->composing = COMPOSING_NO_RULE_HEAD;
1919 ENCODE_COMPOSITION_NO_RULE_START;
1920 coding->consumed_char++;
1921 }
1922 break;
1923
1924 case EMACS_invalid_code:
1925 *dst++ = c1;
1926 coding->consumed_char++;
1927 break;
1928 }
1929 continue;
1930 label_end_of_loop:
1931 result = CODING_FINISH_INSUFFICIENT_SRC;
1932 src = src_base;
1933 break;
1934 }
1935
1936 if (result == CODING_FINISH_NORMAL
1937 && src < src_end)
1938 result = CODING_FINISH_INSUFFICIENT_DST;
1939
1940 /* If this is the last block of the text to be encoded, we must
1941 reset graphic planes and registers to the initial state, and
1942 flush out the carryover if any. */
1943 if (coding->mode & CODING_MODE_LAST_BLOCK)
1944 ENCODE_RESET_PLANE_AND_REGISTER;
1945
1946 coding->consumed = src - source;
1947 coding->produced = coding->produced_char = dst - destination;
1948 return result;
1949 }
1950
1951 \f
1952 /*** 4. SJIS and BIG5 handlers ***/
1953
1954 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1955 quite widely. So, for the moment, Emacs supports them in the bare
1956 C code. But, in the future, they may be supported only by CCL. */
1957
1958 /* SJIS is a coding system encoding three character sets: ASCII, right
1959 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1960 as is. A character of charset katakana-jisx0201 is encoded by
1961 "position-code + 0x80". A character of charset japanese-jisx0208
1962 is encoded in two bytes, but the two position-codes are divided and
1963 shifted so that they fit in the ranges below.
1964
1965 --- CODE RANGE of SJIS ---
1966 (character set) (range)
1967 ASCII 0x00 .. 0x7F
1968 KATAKANA-JISX0201 0xA0 .. 0xDF
1969 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1970 (2nd byte) 0x40 .. 0xFF
1971 -------------------------------
1972
1973 */
1974
1975 /* BIG5 is a coding system encoding two character sets: ASCII and
1976 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1977 character set and is encoded in two-byte.
1978
1979 --- CODE RANGE of BIG5 ---
1980 (character set) (range)
1981 ASCII 0x00 .. 0x7F
1982 Big5 (1st byte) 0xA1 .. 0xFE
1983 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1984 --------------------------
1985
1986 Since the number of characters in Big5 is larger than maximum
1987 characters in Emacs' charset (96x96), it can't be handled as one
1988 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1989 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1990 contains frequently used characters and the latter contains less
1991 frequently used characters. */
1992
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansions, so callers may
   pass arbitrary expressions for the input arguments.  The output
   arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET (set to
   `charset_big5_1' when B1 < 0xC9, else `charset_big5_2') and the
   position-codes C1 and C2 (each 0x21 plus an offset in 0..93).  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int big5_temp_						\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
        (charset) = charset_big5_2;					\
        big5_temp_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;			\
      }									\
    (c1) = big5_temp_ / (0xFF - 0xA1) + 0x21;				\
    (c2) = big5_temp_ % (0xFF - 0xA1) + 0x21;				\
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position-codes C1
   and C2 into the BIG5 byte pair B1, B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int big5_temp_						\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    if ((charset) == charset_big5_2)					\
      big5_temp_ += BIG5_SAME_ROW * (0xC9 - 0xA1);			\
    (b1) = big5_temp_ / BIG5_SAME_ROW + 0xA1;				\
    (b2) = big5_temp_ % BIG5_SAME_ROW;					\
    /* Second bytes skip the gap 0x7F .. 0xA0.  */			\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
2025
/* Produce the decoded form of the character of CHARSET whose
   position-codes are C1 and C2.  `unification_table' must be visible
   where the macro is expanded: when it is non-nil and unify_char
   returns a non-negative character, that character's charset and
   position-codes (obtained with SPLIT_CHAR, which overwrites C1 and
   C2) are used instead.  The result is emitted through
   DECODE_CHARACTER_ASCII, DECODE_CHARACTER_DIMENSION1 or
   DECODE_CHARACTER_DIMENSION2 according to the final charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int charset_alt = (charset);					\
    if (!NILP (unification_table))					\
      {									\
        int c_alt = unify_char (unification_table,			\
                                -1, (charset), c1, c2);			\
        if (c_alt >= 0)							\
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);			\
      }									\
    if (charset_alt != CHARSET_ASCII && charset_alt >= 0)		\
      {									\
        if (CHARSET_DIMENSION (charset_alt) == 1)			\
          DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);		\
        else								\
          DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		\
      }									\
    else								\
      DECODE_CHARACTER_ASCII (c1);					\
  } while (0)
2040
/* Encode the character of CHARSET whose position-codes are C1 and C2
   and store the result at DST, advancing DST.  The variables `dst',
   `coding', `sjis_p' and `unification_table' must be visible where
   the macro is expanded.  C1 and C2 must be lvalues; they may be
   modified.

   When `unification_table' is non-nil and unify_char returns a
   non-negative character, that character is encoded instead.

   Bytes written:
     1 -- ASCII, or katakana-jisx0201 when SJIS_P is nonzero,
     2 -- any other one-dimensional charset (charset id, then C1),
     2 -- jisx0208 when SJIS_P is nonzero, or big5-1/big5-2 when
          SJIS_P is zero,
     3 -- any other two-dimensional charset (charset id, C1, C2).

   `coding->consumed_char' is incremented once.

   The expansion is a single statement without a trailing semicolon,
   so the macro can be used safely as the body of an `if' that has an
   `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int c_alt, charset_alt;						\
    if (!NILP (unification_table)					\
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))							\
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);				\
    else								\
      charset_alt = charset;						\
    if (charset_alt == charset_ascii)					\
      *dst++ = c1;							\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      {									\
        if (sjis_p && charset_alt == charset_katakana_jisx0201)		\
          *dst++ = c1;							\
        else								\
          *dst++ = charset_alt, *dst++ = c1;				\
      }									\
    else								\
      {									\
        c1 &= 0x7F, c2 &= 0x7F;						\
        if (sjis_p && charset_alt == charset_jisx0208)			\
          {								\
            unsigned char s1, s2;					\
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);				\
            *dst++ = s1, *dst++ = s2;					\
          }								\
        else if (!sjis_p						\
                 && (charset_alt == charset_big5_1			\
                     || charset_alt == charset_big5_2))			\
          {								\
            unsigned char b1, b2;					\
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);			\
            *dst++ = b1, *dst++ = b2;					\
          }								\
        else								\
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;		\
      }									\
    coding->consumed_char++;						\
  } while (0)
2083
2084 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2085 Check if a text is encoded in SJIS. If it is, return
2086 CODING_CATEGORY_MASK_SJIS, else return 0. */
2087
2088 int
2089 detect_coding_sjis (src, src_end)
2090 unsigned char *src, *src_end;
2091 {
2092 unsigned char c;
2093
2094 while (src < src_end)
2095 {
2096 c = *src++;
2097 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2098 {
2099 if (src < src_end && *src++ < 0x40)
2100 return 0;
2101 }
2102 }
2103 return CODING_CATEGORY_MASK_SJIS;
2104 }
2105
2106 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2107 Check if a text is encoded in BIG5. If it is, return
2108 CODING_CATEGORY_MASK_BIG5, else return 0. */
2109
2110 int
2111 detect_coding_big5 (src, src_end)
2112 unsigned char *src, *src_end;
2113 {
2114 unsigned char c;
2115
2116 while (src < src_end)
2117 {
2118 c = *src++;
2119 if (c >= 0xA1)
2120 {
2121 if (src >= src_end)
2122 break;
2123 c = *src++;
2124 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2125 return 0;
2126 }
2127 }
2128 return CODING_CATEGORY_MASK_BIG5;
2129 }
2130
2131 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2132 If SJIS_P is 1, decode SJIS text, else decode BIG5 text.

   Decode the SRC_BYTES bytes at SOURCE into DESTINATION.  Bytes below
   0x20 are copied as control codes, with '\r' handled according to
   `coding->eol_type'; bytes below 0x80 are decoded as ASCII; all
   other bytes are decoded as SJIS or BIG5 codes depending on SJIS_P.

   On return, `coding->consumed' and `coding->consumed_char' are both
   set to the number of source bytes processed and `coding->produced'
   to the number of bytes written to DESTINATION.  The return value is
   CODING_FINISH_NORMAL, CODING_FINISH_INSUFFICIENT_SRC,
   CODING_FINISH_INSUFFICIENT_DST or CODING_FINISH_INCONSISTENT_EOL. */
2133
2134 int
2135 decode_coding_sjis_big5 (coding, source, destination,
2136 src_bytes, dst_bytes, sjis_p)
2137 struct coding_system *coding;
2138 unsigned char *source, *destination;
2139 int src_bytes, dst_bytes;
2140 int sjis_p;
2141 {
2142 unsigned char *src = source;
2143 unsigned char *src_end = source + src_bytes;
2144 unsigned char *dst = destination;
2145 unsigned char *dst_end = destination + dst_bytes;
2146 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2147 from DST_END to assure overflow checking is necessary only at the
2148 head of loop. */
2149 unsigned char *adjusted_dst_end = dst_end - 3;
2150 Lisp_Object unification_table
2151 = coding->character_unification_table_for_decode;
2152 int result = CODING_FINISH_NORMAL;
2153
/* Fall back to the standard decode table when unification is enabled
   but this coding system has no table of its own. */
2154 if (!NILP (Venable_character_unification) && NILP (unification_table))
2155 unification_table = Vstandard_character_unification_table_for_decode;
2156
2157 coding->produced_char = 0;
/* When DST_BYTES is zero the output position is bounded by SRC rather
   than by DST_END; presumably the conversion is then done in place --
   TODO confirm against callers. */
2158 while (src < src_end && (dst_bytes
2159 ? (dst < adjusted_dst_end)
2160 : (dst < src - 3)))
2161 {
2162 /* SRC_BASE remembers the start position in source in each loop.
2163 The loop will be exited when there's not enough source text
2164 to analyze two-byte character (within macro ONE_MORE_BYTE).
2165 In that case, SRC is reset to SRC_BASE before exiting. */
2166 unsigned char *src_base = src;
2167 unsigned char c1 = *src++, c2, c3, c4;
2168
/* Control codes: only '\r' and '\n' get special treatment. */
2169 if (c1 < 0x20)
2170 {
2171 if (c1 == '\r')
2172 {
2173 if (coding->eol_type == CODING_EOL_CRLF)
2174 {
2175 ONE_MORE_BYTE (c2);
2176 if (c2 == '\n')
2177 *dst++ = c2;
2178 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2179 {
2180 result = CODING_FINISH_INCONSISTENT_EOL;
2181 goto label_end_of_loop_2;
2182 }
2183 else
2184 /* To process C2 again, SRC is subtracted by 1. */
2185 *dst++ = c1, src--;
2186 }
2187 else if (coding->eol_type == CODING_EOL_CR)
2188 *dst++ = '\n';
2189 else
2190 *dst++ = c1;
2191 }
/* A bare '\n' is inconsistent when the text is declared to use CR or
   CRLF line ends. */
2192 else if (c1 == '\n'
2193 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2194 && (coding->eol_type == CODING_EOL_CR
2195 || coding->eol_type == CODING_EOL_CRLF))
2196 {
2197 result = CODING_FINISH_INCONSISTENT_EOL;
2198 goto label_end_of_loop_2;
2199 }
2200 else
2201 *dst++ = c1;
2202 coding->produced_char++;
2203 }
2204 else if (c1 < 0x80)
2205 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2206 else if (c1 < 0xA0 || c1 >= 0xE0)
2207 {
2208 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
2209 if (sjis_p)
2210 {
2211 ONE_MORE_BYTE (c2);
2212 DECODE_SJIS (c1, c2, c3, c4);
2213 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2214 }
2215 else if (c1 >= 0xE0 && c1 < 0xFF)
2216 {
2217 int charset;
2218
2219 ONE_MORE_BYTE (c2);
2220 DECODE_BIG5 (c1, c2, charset, c3, c4);
2221 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2222 }
2223 else /* Invalid code */
2224 {
2225 *dst++ = c1;
2226 coding->produced_char++;
2227 }
2228 }
2229 else
2230 {
2231 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
/* NOTE(review): for BIG5 this branch is also taken when C1 is 0xA0,
   for which `b1 - 0xA1' in DECODE_BIG5 is negative -- confirm whether
   0xA0 should be treated as an invalid code like 0xFF is above. */
2232 if (sjis_p)
2233 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2234 /* dummy */ c2);
2235 else
2236 {
2237 int charset;
2238
2239 ONE_MORE_BYTE (c2);
2240 DECODE_BIG5 (c1, c2, charset, c3, c4);
2241 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2242 }
2243 }
2244 continue;
2245
2246 label_end_of_loop:
2247 result = CODING_FINISH_INSUFFICIENT_SRC;
/* Reached directly on inconsistent end-of-line, with RESULT already
   set; SRC is rewound so the current code is not counted as consumed. */
2248 label_end_of_loop_2:
2249 src = src_base;
2250 break;
2251 }
2252
/* Leaving the loop normally with source left over means the
   destination bound stopped the conversion. */
2253 if (result == CODING_FINISH_NORMAL
2254 && src < src_end)
2255 result = CODING_FINISH_INSUFFICIENT_DST;
2256
2257 coding->consumed = coding->consumed_char = src - source;
2258 coding->produced = dst - destination;
2259 return result;
2260 }
2261
2262 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2263 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2264 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
2265 sure that all these charsets are registered as official charset
2266 (i.e. do not have extended leading-codes). Characters of other
2267 charsets are produced without any encoding. If SJIS_P is 1, encode
2268 SJIS text, else encode BIG5 text.

   Encode the SRC_BYTES bytes at SOURCE into DESTINATION.  On return,
   `coding->consumed' is the number of source bytes processed,
   `coding->consumed_char' the number of characters counted while
   encoding, and `coding->produced' and `coding->produced_char' the
   number of bytes written.  The return value is CODING_FINISH_NORMAL,
   CODING_FINISH_INSUFFICIENT_SRC or CODING_FINISH_INSUFFICIENT_DST. */
2269
2270 int
2271 encode_coding_sjis_big5 (coding, source, destination,
2272 src_bytes, dst_bytes, sjis_p)
2273 struct coding_system *coding;
2274 unsigned char *source, *destination;
2275 int src_bytes, dst_bytes;
2276 int sjis_p;
2277 {
2278 unsigned char *src = source;
2279 unsigned char *src_end = source + src_bytes;
2280 unsigned char *dst = destination;
2281 unsigned char *dst_end = destination + dst_bytes;
2282 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2283 from DST_END to assure overflow checking is necessary only at the
2284 head of loop. */
/* NOTE(review): the last branch of ENCODE_SJIS_BIG5_CHARACTER writes
   three bytes (charset id, C1, C2) for a two-dimensional charset that
   is neither SJIS nor BIG5 encodable, which is one more than is
   reserved here -- confirm that callers leave enough slack. */
2285 unsigned char *adjusted_dst_end = dst_end - 1;
2286 Lisp_Object unification_table
2287 = coding->character_unification_table_for_encode;
2288 int result = CODING_FINISH_NORMAL;
2289
/* Fall back to the standard encode table when unification is enabled
   but this coding system has no table of its own. */
2290 if (!NILP (Venable_character_unification) && NILP (unification_table))
2291 unification_table = Vstandard_character_unification_table_for_encode;
2292
2293 coding->consumed_char = 0;
/* When DST_BYTES is zero the output position is bounded by SRC rather
   than by DST_END; presumably the conversion is then done in place --
   TODO confirm against callers. */
2294 while (src < src_end && (dst_bytes
2295 ? (dst < adjusted_dst_end)
2296 : (dst < src - 1)))
2297 {
2298 /* SRC_BASE remembers the start position in source in each loop.
2299 The loop will be exited when there's not enough source text
2300 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2301 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2302 before exiting. */
2303 unsigned char *src_base = src;
2304 unsigned char c1 = *src++, c2, c3, c4;
2305
/* While in composing state, rewrite the leading byte before it is
   classified: 0xA0 is replaced by the next byte with its high bit
   cleared, any other byte of 0xA0 or above is reduced by 0x20, and a
   byte below 0xA0 ends the composing state. */
2306 if (coding->composing)
2307 {
2308 if (c1 == 0xA0)
2309 {
2310 ONE_MORE_BYTE (c1);
2311 c1 &= 0x7F;
2312 }
2313 else if (c1 >= 0xA0)
2314 c1 -= 0x20;
2315 else
2316 coding->composing = 0;
2317 }
2318
2319 switch (emacs_code_class[c1])
2320 {
2321 case EMACS_ascii_code:
2322 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2323 break;
2324
2325 case EMACS_control_code:
2326 *dst++ = c1;
2327 coding->consumed_char++;
2328 break;
2329
2330 case EMACS_carriage_return_code:
2331 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2332 {
2333 *dst++ = c1;
2334 coding->consumed_char++;
2335 break;
2336 }
2337 /* fall down to treat '\r' as '\n' ... */
2338
/* Emit the end-of-line sequence selected by `coding->eol_type'. */
2339 case EMACS_linefeed_code:
2340 if (coding->eol_type == CODING_EOL_LF
2341 || coding->eol_type == CODING_EOL_UNDECIDED)
2342 *dst++ = '\n';
2343 else if (coding->eol_type == CODING_EOL_CRLF)
2344 *dst++ = '\r', *dst++ = '\n';
2345 else
2346 *dst++ = '\r';
2347 coding->consumed_char++;
2348 break;
2349
/* For two- and three-byte forms the leading code C1 is passed as the
   charset; for the four-byte form the charset is the second byte C2. */
2350 case EMACS_leading_code_2:
2351 ONE_MORE_BYTE (c2);
2352 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2353 break;
2354
2355 case EMACS_leading_code_3:
2356 TWO_MORE_BYTES (c2, c3);
2357 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2358 break;
2359
2360 case EMACS_leading_code_4:
2361 THREE_MORE_BYTES (c2, c3, c4);
2362 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2363 break;
2364
/* The composition leading code produces no output and is not counted
   in `coding->consumed_char'; it only switches on composing state. */
2365 case EMACS_leading_code_composition:
2366 coding->composing = 1;
2367 break;
2368
2369 default: /* i.e. case EMACS_invalid_code: */
2370 *dst++ = c1;
2371 coding->consumed_char++;
2372 }
2373 continue;
2374
2375 label_end_of_loop:
2376 result = CODING_FINISH_INSUFFICIENT_SRC;
2377 src = src_base;
2378 break;
2379 }
2380
/* Leaving the loop normally with source left over means the
   destination bound stopped the conversion. */
2381 if (result == CODING_FINISH_NORMAL
2382 && src < src_end)
2383 result = CODING_FINISH_INSUFFICIENT_DST;
2384 coding->consumed = src - source;
2385 coding->produced = coding->produced_char = dst - destination;
2386 return result;
2387 }
2388
2389 \f
2390 /*** 5. End-of-line handlers ***/
2391
2392 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2393 This function is called only when `coding->eol_type' is
2394 CODING_EOL_CRLF or CODING_EOL_CR. */
2395
2396 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2397 struct coding_system *coding;
2398 unsigned char *source, *destination;
2399 int src_bytes, dst_bytes;
2400 {
2401 unsigned char *src = source;
2402 unsigned char *src_end = source + src_bytes;
2403 unsigned char *dst = destination;
2404 unsigned char *dst_end = destination + dst_bytes;
2405 int result = CODING_FINISH_NORMAL;
2406
2407 if (src_bytes <= 0)
2408 return result;
2409
2410 switch (coding->eol_type)
2411 {
2412 case CODING_EOL_CRLF:
2413 {
2414 /* Since the maximum bytes produced by each loop is 2, we
2415 subtract 1 from DST_END to assure overflow checking is
2416 necessary only at the head of loop. */
2417 unsigned char *adjusted_dst_end = dst_end - 1;
2418
2419 while (src < src_end && (dst_bytes
2420 ? (dst < adjusted_dst_end)
2421 : (dst < src - 1)))
2422 {
2423 unsigned char *src_base = src;
2424 unsigned char c = *src++;
2425 if (c == '\r')
2426 {
2427 ONE_MORE_BYTE (c);
2428 if (c != '\n')
2429 {
2430 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2431 {
2432 result = CODING_FINISH_INCONSISTENT_EOL;
2433 goto label_end_of_loop_2;
2434 }
2435 *dst++ = '\r';
2436 }
2437 *dst++ = c;
2438 }
2439 else if (c == '\n'
2440 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2441 {
2442 result = CODING_FINISH_INCONSISTENT_EOL;
2443 goto label_end_of_loop_2;
2444 }
2445 else
2446 *dst++ = c;
2447 continue;
2448
2449 label_end_of_loop:
2450 result = CODING_FINISH_INSUFFICIENT_SRC;
2451 label_end_of_loop_2:
2452 src = src_base;
2453 break;
2454 }
2455 if (result == CODING_FINISH_NORMAL
2456 && src < src_end)
2457 result = CODING_FINISH_INSUFFICIENT_DST;
2458 }
2459 break;
2460
2461 case CODING_EOL_CR:
2462 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2463 {
2464 while (src < src_end) if (*src++ == '\n') break;
2465 if (*--src == '\n')
2466 {
2467 src_bytes = src - source;
2468 result = CODING_FINISH_INCONSISTENT_EOL;
2469 }
2470 }
2471 if (dst_bytes && src_bytes > dst_bytes)
2472 {
2473 result = CODING_FINISH_INSUFFICIENT_DST;
2474 src_bytes = dst_bytes;
2475 }
2476 if (dst_bytes)
2477 bcopy (source, destination, src_bytes);
2478 else
2479 safe_bcopy (source, destination, src_bytes);
2480 src = source + src_bytes;
2481 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2482 break;
2483
2484 default: /* i.e. case: CODING_EOL_LF */
2485 if (dst_bytes && src_bytes > dst_bytes)
2486 {
2487 result = CODING_FINISH_INSUFFICIENT_DST;
2488 src_bytes = dst_bytes;
2489 }
2490 if (dst_bytes)
2491 bcopy (source, destination, src_bytes);
2492 else
2493 safe_bcopy (source, destination, src_bytes);
2494 src += src_bytes;
2495 dst += dst_bytes;
2496 break;
2497 }
2498
2499 coding->consumed = coding->consumed_char = src - source;
2500 coding->produced = coding->produced_char = dst - destination;
2501 return result;
2502 }
2503
2504 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2505 format of end-of-line according to `coding->eol_type'. If
2506 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2507 '\r' in source text also means end-of-line. */
2508
2509 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2510 struct coding_system *coding;
2511 unsigned char *source, *destination;
2512 int src_bytes, dst_bytes;
2513 {
2514 unsigned char *src = source;
2515 unsigned char *dst = destination;
2516 int result = CODING_FINISH_NORMAL;
2517
2518 if (coding->eol_type == CODING_EOL_CRLF)
2519 {
2520 unsigned char c;
2521 unsigned char *src_end = source + src_bytes;
2522 unsigned char *dst_end = destination + dst_bytes;
2523 /* Since the maximum bytes produced by each loop is 2, we
2524 subtract 1 from DST_END to assure overflow checking is
2525 necessary only at the head of loop. */
2526 unsigned char *adjusted_dst_end = dst_end - 1;
2527
2528 while (src < src_end && (dst_bytes
2529 ? (dst < adjusted_dst_end)
2530 : (dst < src - 1)))
2531 {
2532 c = *src++;
2533 if (c == '\n'
2534 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2535 *dst++ = '\r', *dst++ = '\n';
2536 else
2537 *dst++ = c;
2538 }
2539 if (src < src_end)
2540 result = CODING_FINISH_INSUFFICIENT_DST;
2541 }
2542 else
2543 {
2544 if (dst_bytes && src_bytes > dst_bytes)
2545 {
2546 src_bytes = dst_bytes;
2547 result = CODING_FINISH_INSUFFICIENT_DST;
2548 }
2549 if (dst_bytes)
2550 bcopy (source, destination, src_bytes);
2551 else
2552 safe_bcopy (source, destination, src_bytes);
2553 if (coding->eol_type == CODING_EOL_CRLF)
2554 {
2555 while (src_bytes--)
2556 if (*dst++ == '\n') dst[-1] = '\r';
2557 }
2558 else if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2559 {
2560 while (src_bytes--)
2561 if (*dst++ == '\r') dst[-1] = '\n';
2562 }
2563 src += src_bytes;
2564 dst += src_bytes;
2565 }
2566
2567 coding->consumed = coding->consumed_char = src - source;
2568 coding->produced = coding->produced_char = dst - destination;
2569 return result;
2570 }
2571
2572 \f
2573 /*** 6. C library functions ***/
2574
2575 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2576 has a property `coding-system'. The value of this property is a
2577 vector of length 5 (called as coding-vector). Among elements of
2578 this vector, the first (element[0]) and the fifth (element[4])
2579 carry important information for decoding/encoding. Before
2580 decoding/encoding, this information should be set in fields of a
2581 structure of type `coding_system'.
2582
2583 A value of property `coding-system' can be a symbol of another
2584 subsidiary coding-system. In that case, Emacs gets coding-vector
2585 from that symbol.
2586
2587 `element[0]' contains information to be set in `coding->type'. The
2588 value and its meaning is as follows:
2589
2590 0 -- coding_type_emacs_mule
2591 1 -- coding_type_sjis
2592 2 -- coding_type_iso2022
2593 3 -- coding_type_big5
2594 4 -- coding_type_ccl encoder/decoder written in CCL
5 -- coding_type_raw_text
2595 nil -- coding_type_no_conversion
2596 t -- coding_type_undecided (automatic conversion on decoding,
2597 no-conversion on encoding)
2598
2599 `element[4]' contains information to be set in `coding->flags' and
2600 `coding->spec'. The meaning varies by `coding->type'.
2601
2602 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2603 of length 32 (of which the first 17 sub-elements are used now).
2604 Meanings of these sub-elements are:
2605
2606 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2607 If the value is an integer of valid charset, the charset is
2608 assumed to be designated to graphic register N initially.
2609
2610 If the value is minus, it is a minus value of charset which
2611 reserves graphic register N, which means that the charset is
2612 not designated initially but should be designated to graphic
2613 register N just before encoding a character in that charset.
2614
2615 If the value is nil, graphic register N is never used on
2616 encoding.
2617
2618 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2619 Each value takes t or nil. See the section ISO2022 of
2620 `coding.h' for more information.
2621
2622 If `coding->type' is `coding_type_big5', element[4] is t to denote
2623 BIG5-ETen or nil to denote BIG5-HKU.
2624
2625 If `coding->type' takes the other value, element[4] is ignored.
2626
2627 Emacs Lisp's coding system also carries information about format of
2628 end-of-line in a value of property `eol-type'. If the value is
2629 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2630 means CODING_EOL_CR. If it is not integer, it should be a vector
2631 of subsidiary coding systems of which property `eol-type' has one
2632 of above values.
2633
2634 */
2635
2636 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2637 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2638 is setup so that no conversion is necessary and return -1, else
2639 return 0. */
2640
2641 int
2642 setup_coding_system (coding_system, coding)
2643 Lisp_Object coding_system;
2644 struct coding_system *coding;
2645 {
2646 Lisp_Object coding_spec, coding_type, eol_type, plist;
2647 Lisp_Object val;
2648 int i;
2649
2650 /* Initialize some fields required for all kinds of coding systems. */
2651 coding->symbol = coding_system;
2652 coding->common_flags = 0;
2653 coding->mode = 0;
2654 coding->heading_ascii = -1;
2655 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2656 coding_spec = Fget (coding_system, Qcoding_system);
2657 if (!VECTORP (coding_spec)
2658 || XVECTOR (coding_spec)->size != 5
2659 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2660 goto label_invalid_coding_system;
2661
2662 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2663 if (VECTORP (eol_type))
2664 {
2665 coding->eol_type = CODING_EOL_UNDECIDED;
2666 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2667 }
2668 else if (XFASTINT (eol_type) == 1)
2669 {
2670 coding->eol_type = CODING_EOL_CRLF;
2671 coding->common_flags
2672 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2673 }
2674 else if (XFASTINT (eol_type) == 2)
2675 {
2676 coding->eol_type = CODING_EOL_CR;
2677 coding->common_flags
2678 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2679 }
2680 else
2681 coding->eol_type = CODING_EOL_LF;
2682
2683 coding_type = XVECTOR (coding_spec)->contents[0];
2684 /* Try short cut. */
2685 if (SYMBOLP (coding_type))
2686 {
2687 if (EQ (coding_type, Qt))
2688 {
2689 coding->type = coding_type_undecided;
2690 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2691 }
2692 else
2693 coding->type = coding_type_no_conversion;
2694 return 0;
2695 }
2696
2697 /* Initialize remaining fields. */
2698 coding->composing = 0;
2699 coding->character_unification_table_for_decode = Qnil;
2700 coding->character_unification_table_for_encode = Qnil;
2701
2702 /* Get values of coding system properties:
2703 `post-read-conversion', `pre-write-conversion',
2704 `character-unification-table-for-decode',
2705 `character-unification-table-for-encode'. */
2706 plist = XVECTOR (coding_spec)->contents[3];
2707 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2708 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2709 val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2710 if (SYMBOLP (val))
2711 val = Fget (val, Qcharacter_unification_table_for_decode);
2712 coding->character_unification_table_for_decode
2713 = CHAR_TABLE_P (val) ? val : Qnil;
2714 val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2715 if (SYMBOLP (val))
2716 val = Fget (val, Qcharacter_unification_table_for_encode);
2717 coding->character_unification_table_for_encode
2718 = CHAR_TABLE_P (val) ? val : Qnil;
2719 val = Fplist_get (plist, Qcoding_category);
2720 if (!NILP (val))
2721 {
2722 val = Fget (val, Qcoding_category_index);
2723 if (INTEGERP (val))
2724 coding->category_idx = XINT (val);
2725 else
2726 goto label_invalid_coding_system;
2727 }
2728 else
2729 goto label_invalid_coding_system;
2730
2731 val = Fplist_get (plist, Qsafe_charsets);
2732 if (EQ (val, Qt))
2733 {
2734 for (i = 0; i <= MAX_CHARSET; i++)
2735 coding->safe_charsets[i] = 1;
2736 }
2737 else
2738 {
2739 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2740 while (CONSP (val))
2741 {
2742 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2743 coding->safe_charsets[i] = 1;
2744 val = XCONS (val)->cdr;
2745 }
2746 }
2747
2748 switch (XFASTINT (coding_type))
2749 {
2750 case 0:
2751 coding->type = coding_type_emacs_mule;
2752 if (!NILP (coding->post_read_conversion))
2753 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2754 if (!NILP (coding->pre_write_conversion))
2755 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2756 break;
2757
2758 case 1:
2759 coding->type = coding_type_sjis;
2760 coding->common_flags
2761 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2762 break;
2763
2764 case 2:
2765 coding->type = coding_type_iso2022;
2766 coding->common_flags
2767 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2768 {
2769 Lisp_Object val, temp;
2770 Lisp_Object *flags;
2771 int i, charset, reg_bits = 0;
2772
2773 val = XVECTOR (coding_spec)->contents[4];
2774
2775 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2776 goto label_invalid_coding_system;
2777
2778 flags = XVECTOR (val)->contents;
2779 coding->flags
2780 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2781 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2782 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2783 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2784 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2785 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2786 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2787 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2788 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2789 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2790 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2791 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2792 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2793 );
2794
2795 /* Invoke graphic register 0 to plane 0. */
2796 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2797 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2798 CODING_SPEC_ISO_INVOCATION (coding, 1)
2799 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2800 /* Not single shifting at first. */
2801 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2802 /* Beginning of buffer should also be regarded as bol. */
2803 CODING_SPEC_ISO_BOL (coding) = 1;
2804
2805 for (charset = 0; charset <= MAX_CHARSET; charset++)
2806 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2807 val = Vcharset_revision_alist;
2808 while (CONSP (val))
2809 {
2810 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2811 if (charset >= 0
2812 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2813 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2814 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2815 val = XCONS (val)->cdr;
2816 }
2817
2818 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2819 FLAGS[REG] can be one of below:
2820 integer CHARSET: CHARSET occupies register I,
2821 t: designate nothing to REG initially, but can be used
2822 by any charsets,
2823 list of integer, nil, or t: designate the first
2824 element (if integer) to REG initially, the remaining
2825 elements (if integer) is designated to REG on request,
2826 if an element is t, REG can be used by any charsets,
2827 nil: REG is never used. */
2828 for (charset = 0; charset <= MAX_CHARSET; charset++)
2829 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2830 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2831 for (i = 0; i < 4; i++)
2832 {
2833 if (INTEGERP (flags[i])
2834 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2835 || (charset = get_charset_id (flags[i])) >= 0)
2836 {
2837 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2838 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2839 }
2840 else if (EQ (flags[i], Qt))
2841 {
2842 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2843 reg_bits |= 1 << i;
2844 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
2845 }
2846 else if (CONSP (flags[i]))
2847 {
2848 Lisp_Object tail = flags[i];
2849
2850 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
2851 if (INTEGERP (XCONS (tail)->car)
2852 && (charset = XINT (XCONS (tail)->car),
2853 CHARSET_VALID_P (charset))
2854 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2855 {
2856 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2857 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2858 }
2859 else
2860 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2861 tail = XCONS (tail)->cdr;
2862 while (CONSP (tail))
2863 {
2864 if (INTEGERP (XCONS (tail)->car)
2865 && (charset = XINT (XCONS (tail)->car),
2866 CHARSET_VALID_P (charset))
2867 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2868 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2869 = i;
2870 else if (EQ (XCONS (tail)->car, Qt))
2871 reg_bits |= 1 << i;
2872 tail = XCONS (tail)->cdr;
2873 }
2874 }
2875 else
2876 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2877
2878 CODING_SPEC_ISO_DESIGNATION (coding, i)
2879 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2880 }
2881
2882 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2883 {
2884 /* REG 1 can be used only by locking shift in 7-bit env. */
2885 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2886 reg_bits &= ~2;
2887 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2888 /* Without any shifting, only REG 0 and 1 can be used. */
2889 reg_bits &= 3;
2890 }
2891
2892 if (reg_bits)
2893 for (charset = 0; charset <= MAX_CHARSET; charset++)
2894 {
2895 if (CHARSET_VALID_P (charset))
2896 {
2897 /* There exist some default graphic registers to be
2898 used CHARSET. */
2899
2900 /* We had better avoid designating a charset of
2901 CHARS96 to REG 0 as far as possible. */
2902 if (CHARSET_CHARS (charset) == 96)
2903 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2904 = (reg_bits & 2
2905 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
2906 else
2907 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2908 = (reg_bits & 1
2909 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2910 }
2911 }
2912 }
2913 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
2914 coding->spec.iso2022.last_invalid_designation_register = -1;
2915 break;
2916
2917 case 3:
2918 coding->type = coding_type_big5;
2919 coding->common_flags
2920 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2921 coding->flags
2922 = (NILP (XVECTOR (coding_spec)->contents[4])
2923 ? CODING_FLAG_BIG5_HKU
2924 : CODING_FLAG_BIG5_ETEN);
2925 break;
2926
2927 case 4:
2928 coding->type = coding_type_ccl;
2929 coding->common_flags
2930 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2931 {
2932 Lisp_Object val = XVECTOR (coding_spec)->contents[4];
2933 if (CONSP (val)
2934 && VECTORP (XCONS (val)->car)
2935 && VECTORP (XCONS (val)->cdr))
2936 {
2937 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2938 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2939 }
2940 else
2941 goto label_invalid_coding_system;
2942 }
2943 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
2944 break;
2945
2946 case 5:
2947 coding->type = coding_type_raw_text;
2948 break;
2949
2950 default:
2951 goto label_invalid_coding_system;
2952 }
2953 return 0;
2954
2955 label_invalid_coding_system:
2956 coding->type = coding_type_no_conversion;
2957 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
2958 coding->common_flags = 0;
2959 coding->eol_type = CODING_EOL_LF;
2960 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
2961 return -1;
2962 }
2963
2964 /* Emacs has a mechanism to automatically detect a coding system if it
2965 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2966 it's impossible to distinguish some coding systems accurately
2967 because they use the same range of codes. So, at first, coding
systems are categorized into the following 10 categories:
2969
2970 o coding-category-emacs-mule
2971
2972 The category for a coding system which has the same code range
2973 as Emacs' internal format. Assigned the coding-system (Lisp
2974 symbol) `emacs-mule' by default.
2975
2976 o coding-category-sjis
2977
2978 The category for a coding system which has the same code range
2979 as SJIS. Assigned the coding-system (Lisp
2980 symbol) `japanese-shift-jis' by default.
2981
2982 o coding-category-iso-7
2983
2984 The category for a coding system which has the same code range
2985 as ISO2022 of 7-bit environment. This doesn't use any locking
2986 shift and single shift functions. This can encode/decode all
2987 charsets. Assigned the coding-system (Lisp symbol)
2988 `iso-2022-7bit' by default.
2989
2990 o coding-category-iso-7-tight
2991
2992 Same as coding-category-iso-7 except that this can
2993 encode/decode only the specified charsets.
2994
2995 o coding-category-iso-8-1
2996
2997 The category for a coding system which has the same code range
2998 as ISO2022 of 8-bit environment and graphic plane 1 used only
2999 for DIMENSION1 charset. This doesn't use any locking shift
3000 and single shift functions. Assigned the coding-system (Lisp
3001 symbol) `iso-latin-1' by default.
3002
3003 o coding-category-iso-8-2
3004
3005 The category for a coding system which has the same code range
3006 as ISO2022 of 8-bit environment and graphic plane 1 used only
3007 for DIMENSION2 charset. This doesn't use any locking shift
3008 and single shift functions. Assigned the coding-system (Lisp
3009 symbol) `japanese-iso-8bit' by default.
3010
3011 o coding-category-iso-7-else
3012
3013 The category for a coding system which has the same code range
as ISO2022 of 7-bit environment but uses locking shift or
3015 single shift functions. Assigned the coding-system (Lisp
3016 symbol) `iso-2022-7bit-lock' by default.
3017
3018 o coding-category-iso-8-else
3019
3020 The category for a coding system which has the same code range
as ISO2022 of 8-bit environment but uses locking shift or
3022 single shift functions. Assigned the coding-system (Lisp
3023 symbol) `iso-2022-8bit-ss2' by default.
3024
3025 o coding-category-big5
3026
3027 The category for a coding system which has the same code range
3028 as BIG5. Assigned the coding-system (Lisp symbol)
3029 `cn-big5' by default.
3030
3031 o coding-category-binary
3032
3033 The category for a coding system not categorized in any of the
3034 above. Assigned the coding-system (Lisp symbol)
3035 `no-conversion' by default.
3036
3037 Each of them is a Lisp symbol and the value is an actual
3038 `coding-system's (this is also a Lisp symbol) assigned by a user.
3039 What Emacs does actually is to detect a category of coding system.
3040 Then, it uses a `coding-system' assigned to it. If Emacs can't
3041 decide only one possible category, it selects a category of the
3042 highest priority. Priorities of categories are also specified by a
3043 user in a Lisp variable `coding-category-list'.
3044
3045 */
3046
3047 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3048 If it detects possible coding systems, return an integer in which
3049 appropriate flag bits are set. Flag bits are defined by macros
3050 CODING_CATEGORY_MASK_XXX in `coding.h'.
3051
3052 How many ASCII characters are at the head is returned as *SKIP. */
3053
3054 static int
3055 detect_coding_mask (source, src_bytes, priorities, skip)
3056 unsigned char *source;
3057 int src_bytes, *priorities, *skip;
3058 {
3059 register unsigned char c;
3060 unsigned char *src = source, *src_end = source + src_bytes;
3061 unsigned int mask = (CODING_CATEGORY_MASK_ISO_7BIT
3062 | CODING_CATEGORY_MASK_ISO_SHIFT);
3063 int i;
3064
3065 /* At first, skip all ASCII characters and control characters except
3066 for three ISO2022 specific control characters. */
3067 label_loop_detect_coding:
3068 while (src < src_end)
3069 {
3070 c = *src;
3071 if (c >= 0x80
3072 || ((mask & CODING_CATEGORY_MASK_ISO_7BIT)
3073 && c == ISO_CODE_ESC)
3074 || ((mask & CODING_CATEGORY_MASK_ISO_SHIFT)
3075 && (c == ISO_CODE_SI || c == ISO_CODE_SO)))
3076 break;
3077 src++;
3078 }
3079 *skip = src - source;
3080
3081 if (src >= src_end)
3082 /* We found nothing other than ASCII. There's nothing to do. */
3083 return 0;
3084
3085 /* The text seems to be encoded in some multilingual coding system.
3086 Now, try to find in which coding system the text is encoded. */
3087 if (c < 0x80)
3088 {
3089 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3090 /* C is an ISO2022 specific control code of C0. */
3091 mask = detect_coding_iso2022 (src, src_end);
3092 if (mask == 0)
3093 {
3094 /* No valid ISO2022 code follows C. Try again. */
3095 src++;
3096 mask = (c != ISO_CODE_ESC
3097 ? CODING_CATEGORY_MASK_ISO_7BIT
3098 : CODING_CATEGORY_MASK_ISO_SHIFT);
3099 goto label_loop_detect_coding;
3100 }
3101 if (priorities)
3102 goto label_return_highest_only;
3103 }
3104 else
3105 {
3106 int try;
3107
3108 if (c < 0xA0)
3109 {
3110 /* C is the first byte of SJIS character code,
3111 or a leading-code of Emacs' internal format (emacs-mule). */
3112 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3113
3114 /* Or, if C is a special latin extra code,
3115 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3116 or is an ISO2022 control-sequence-introducer (CSI),
3117 we should also consider the possibility of ISO2022 codings. */
3118 if ((VECTORP (Vlatin_extra_code_table)
3119 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3120 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3121 || (c == ISO_CODE_CSI
3122 && (src < src_end
3123 && (*src == ']'
3124 || ((*src == '0' || *src == '1' || *src == '2')
3125 && src + 1 < src_end
3126 && src[1] == ']')))))
3127 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3128 | CODING_CATEGORY_MASK_ISO_8BIT);
3129 }
3130 else
3131 /* C is a character of ISO2022 in graphic plane right,
3132 or a SJIS's 1-byte character code (i.e. JISX0201),
3133 or the first byte of BIG5's 2-byte code. */
3134 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3135 | CODING_CATEGORY_MASK_ISO_8BIT
3136 | CODING_CATEGORY_MASK_SJIS
3137 | CODING_CATEGORY_MASK_BIG5);
3138
3139 mask = 0;
3140 if (priorities)
3141 {
3142 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3143 {
3144 priorities[i] &= try;
3145 if (priorities[i] & CODING_CATEGORY_MASK_ISO)
3146 mask = detect_coding_iso2022 (src, src_end);
3147 else if (priorities[i] & CODING_CATEGORY_MASK_SJIS)
3148 mask = detect_coding_sjis (src, src_end);
3149 else if (priorities[i] & CODING_CATEGORY_MASK_BIG5)
3150 mask = detect_coding_big5 (src, src_end);
3151 else if (priorities[i] & CODING_CATEGORY_MASK_EMACS_MULE)
3152 mask = detect_coding_emacs_mule (src, src_end);
3153 if (mask)
3154 goto label_return_highest_only;
3155 }
3156 return CODING_CATEGORY_MASK_RAW_TEXT;
3157 }
3158 if (try & CODING_CATEGORY_MASK_ISO)
3159 mask |= detect_coding_iso2022 (src, src_end);
3160 if (try & CODING_CATEGORY_MASK_SJIS)
3161 mask |= detect_coding_sjis (src, src_end);
3162 if (try & CODING_CATEGORY_MASK_BIG5)
3163 mask |= detect_coding_big5 (src, src_end);
3164 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3165 mask |= detect_coding_emacs_mule (src, src_end);
3166 }
3167 return (mask | CODING_CATEGORY_MASK_RAW_TEXT);
3168
3169 label_return_highest_only:
3170 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3171 {
3172 if (mask & priorities[i])
3173 return priorities[i];
3174 }
3175 return CODING_CATEGORY_MASK_RAW_TEXT;
3176 }
3177
3178 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3179 The information of the detected coding system is set in CODING. */
3180
3181 void
3182 detect_coding (coding, src, src_bytes)
3183 struct coding_system *coding;
3184 unsigned char *src;
3185 int src_bytes;
3186 {
3187 unsigned int idx;
3188 int skip, mask, i;
3189 int priorities[CODING_CATEGORY_IDX_MAX];
3190 Lisp_Object val = Vcoding_category_list;
3191
3192 i = 0;
3193 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
3194 {
3195 if (! SYMBOLP (XCONS (val)->car))
3196 break;
3197 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
3198 if (idx >= CODING_CATEGORY_IDX_MAX)
3199 break;
3200 priorities[i++] = (1 << idx);
3201 val = XCONS (val)->cdr;
3202 }
3203 /* If coding-category-list is valid and contains all coding
3204 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
3205 the following code saves Emacs from craching. */
3206 while (i < CODING_CATEGORY_IDX_MAX)
3207 priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
3208
3209 mask = detect_coding_mask (src, src_bytes, priorities, &skip);
3210 coding->heading_ascii = skip;
3211
3212 if (!mask) return;
3213
3214 /* We found a single coding system of the highest priority in MASK. */
3215 idx = 0;
3216 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3217 if (! mask)
3218 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3219
3220 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3221
3222 if (coding->eol_type != CODING_EOL_UNDECIDED)
3223 {
3224 Lisp_Object tmp = Fget (val, Qeol_type);
3225
3226 if (VECTORP (tmp))
3227 val = XVECTOR (tmp)->contents[coding->eol_type];
3228 }
3229 setup_coding_system (val, coding);
3230 /* Set this again because setup_coding_system reset this member. */
3231 coding->heading_ascii = skip;
3232 }
3233
3234 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3235 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3236 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3237
3238 How many non-eol characters are at the head is returned as *SKIP. */
3239
3240 #define MAX_EOL_CHECK_COUNT 3
3241
3242 static int
3243 detect_eol_type (source, src_bytes, skip)
3244 unsigned char *source;
3245 int src_bytes, *skip;
3246 {
3247 unsigned char *src = source, *src_end = src + src_bytes;
3248 unsigned char c;
3249 int total = 0; /* How many end-of-lines are found so far. */
3250 int eol_type = CODING_EOL_UNDECIDED;
3251 int this_eol_type;
3252
3253 *skip = 0;
3254
3255 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3256 {
3257 c = *src++;
3258 if (c == '\n' || c == '\r')
3259 {
3260 if (*skip == 0)
3261 *skip = src - 1 - source;
3262 total++;
3263 if (c == '\n')
3264 this_eol_type = CODING_EOL_LF;
3265 else if (src >= src_end || *src != '\n')
3266 this_eol_type = CODING_EOL_CR;
3267 else
3268 this_eol_type = CODING_EOL_CRLF, src++;
3269
3270 if (eol_type == CODING_EOL_UNDECIDED)
3271 /* This is the first end-of-line. */
3272 eol_type = this_eol_type;
3273 else if (eol_type != this_eol_type)
3274 {
3275 /* The found type is different from what found before. */
3276 eol_type = CODING_EOL_INCONSISTENT;
3277 break;
3278 }
3279 }
3280 }
3281
3282 if (*skip == 0)
3283 *skip = src_end - source;
3284 return eol_type;
3285 }
3286
3287 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3288 is encoded. If it detects an appropriate format of end-of-line, it
3289 sets the information in *CODING. */
3290
3291 void
3292 detect_eol (coding, src, src_bytes)
3293 struct coding_system *coding;
3294 unsigned char *src;
3295 int src_bytes;
3296 {
3297 Lisp_Object val;
3298 int skip;
3299 int eol_type = detect_eol_type (src, src_bytes, &skip);
3300
3301 if (coding->heading_ascii > skip)
3302 coding->heading_ascii = skip;
3303 else
3304 skip = coding->heading_ascii;
3305
3306 if (eol_type == CODING_EOL_UNDECIDED)
3307 return;
3308 if (eol_type == CODING_EOL_INCONSISTENT)
3309 {
3310 #if 0
3311 /* This code is suppressed until we find a better way to
3312 distinguish raw text file and binary file. */
3313
3314 /* If we have already detected that the coding is raw-text, the
3315 coding should actually be no-conversion. */
3316 if (coding->type == coding_type_raw_text)
3317 {
3318 setup_coding_system (Qno_conversion, coding);
3319 return;
3320 }
3321 /* Else, let's decode only text code anyway. */
3322 #endif /* 0 */
3323 eol_type = CODING_EOL_LF;
3324 }
3325
3326 val = Fget (coding->symbol, Qeol_type);
3327 if (VECTORP (val) && XVECTOR (val)->size == 3)
3328 {
3329 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3330 coding->heading_ascii = skip;
3331 }
3332 }
3333
/* Extra bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding text in CODING (a pointer to struct
   coding_system) may enlarge it.  The argument is parenthesized so
   that any pointer expression can be passed, but note that it is
   evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                                         \
  ((coding)->type == coding_type_iso2022                                    \
   ? 3                                                                      \
   : (((coding)->type == coding_type_sjis                                   \
       || (coding)->type == coding_type_big5)                               \
      ? 2                                                                   \
      : ((coding)->type == coding_type_raw_text                             \
         ? 1                                                                \
         : ((coding)->type == coding_type_ccl                               \
            ? (coding)->spec.ccl.decoder.buf_magnification                  \
            : 2))))
3346
3347 /* Return maximum size (bytes) of a buffer enough for decoding
3348 SRC_BYTES of text encoded in CODING. */
3349
3350 int
3351 decoding_buffer_size (coding, src_bytes)
3352 struct coding_system *coding;
3353 int src_bytes;
3354 {
3355 return (src_bytes * DECODING_BUFFER_MAG (coding)
3356 + CONVERSION_BUFFER_EXTRA_ROOM);
3357 }
3358
3359 /* Return maximum size (bytes) of a buffer enough for encoding
3360 SRC_BYTES of text to CODING. */
3361
3362 int
3363 encoding_buffer_size (coding, src_bytes)
3364 struct coding_system *coding;
3365 int src_bytes;
3366 {
3367 int magnification;
3368
3369 if (coding->type == coding_type_ccl)
3370 magnification = coding->spec.ccl.encoder.buf_magnification;
3371 else
3372 magnification = 3;
3373
3374 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3375 }
3376
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and
   its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  Sufficient memory is allocated automatically: when
   the current buffer is too small it is replaced (its contents are
   NOT preserved) by one whose size is the current size doubled until
   it reaches SIZE.

   NOTE(review): the allocation is done by xmalloc; whether this
   function can really return NULL on memory exhaustion depends on
   xmalloc's behavior, which is not visible here.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int new_size;
      /* Do the doubling in unsigned arithmetic so that it can't
         overflow: REAL_SIZE is less than SIZE (which fits in an int)
         whenever it is doubled.  Start from the minimum size if no
         buffer has been set up yet; doubling zero would never end.  */
      unsigned int real_size
        = (conversion_buffer_size > 0
           ? (unsigned int) conversion_buffer_size
           : (unsigned int) MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < (unsigned int) size)
        real_size *= 2;
      /* If the doubled size doesn't fit in an int, settle for exactly
         what was requested.  */
      if (real_size > ((unsigned int) ~0 >> 1))
        real_size = (unsigned int) size;
      new_size = (int) real_size;

      buf = (char *) xmalloc (new_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = new_size;
    }
  return conversion_buffer;
}
3405
3406 int
3407 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3408 struct coding_system *coding;
3409 unsigned char *source, *destination;
3410 int src_bytes, dst_bytes, encodep;
3411 {
3412 struct ccl_program *ccl
3413 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3414 int result;
3415
3416 coding->produced = ccl_driver (ccl, source, destination,
3417 src_bytes, dst_bytes, &(coding->consumed));
3418 if (encodep)
3419 {
3420 coding->produced_char = coding->produced;
3421 coding->consumed_char
3422 = multibyte_chars_in_text (source, coding->consumed);
3423 }
3424 else
3425 {
3426 coding->produced_char
3427 = multibyte_chars_in_text (destination, coding->produced);
3428 coding->consumed_char = coding->consumed;
3429 }
3430 switch (ccl->status)
3431 {
3432 case CCL_STAT_SUSPEND_BY_SRC:
3433 result = CODING_FINISH_INSUFFICIENT_SRC;
3434 break;
3435 case CCL_STAT_SUSPEND_BY_DST:
3436 result = CODING_FINISH_INSUFFICIENT_DST;
3437 break;
3438 default:
3439 result = CODING_FINISH_NORMAL;
3440 break;
3441 }
3442 return result;
3443 }
3444
3445 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3446 decoding, it may detect coding system and format of end-of-line if
3447 those are not yet decided. */
3448
3449 int
3450 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3451 struct coding_system *coding;
3452 unsigned char *source, *destination;
3453 int src_bytes, dst_bytes;
3454 {
3455 int result;
3456
3457 if (src_bytes <= 0)
3458 {
3459 coding->produced = coding->produced_char = 0;
3460 coding->consumed = coding->consumed_char = 0;
3461 return CODING_FINISH_NORMAL;
3462 }
3463
3464 if (coding->type == coding_type_undecided)
3465 detect_coding (coding, source, src_bytes);
3466
3467 if (coding->eol_type == CODING_EOL_UNDECIDED)
3468 detect_eol (coding, source, src_bytes);
3469
3470 switch (coding->type)
3471 {
3472 case coding_type_emacs_mule:
3473 case coding_type_undecided:
3474 case coding_type_raw_text:
3475 if (coding->eol_type == CODING_EOL_LF
3476 || coding->eol_type == CODING_EOL_UNDECIDED)
3477 goto label_no_conversion;
3478 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3479 break;
3480
3481 case coding_type_sjis:
3482 result = decode_coding_sjis_big5 (coding, source, destination,
3483 src_bytes, dst_bytes, 1);
3484 break;
3485
3486 case coding_type_iso2022:
3487 result = decode_coding_iso2022 (coding, source, destination,
3488 src_bytes, dst_bytes);
3489 break;
3490
3491 case coding_type_big5:
3492 result = decode_coding_sjis_big5 (coding, source, destination,
3493 src_bytes, dst_bytes, 0);
3494 break;
3495
3496 case coding_type_ccl:
3497 result = ccl_coding_driver (coding, source, destination,
3498 src_bytes, dst_bytes, 0);
3499 break;
3500
3501 default: /* i.e. case coding_type_no_conversion: */
3502 label_no_conversion:
3503 if (dst_bytes && src_bytes > dst_bytes)
3504 {
3505 coding->produced = dst_bytes;
3506 result = CODING_FINISH_INSUFFICIENT_DST;
3507 }
3508 else
3509 {
3510 coding->produced = src_bytes;
3511 result = CODING_FINISH_NORMAL;
3512 }
3513 if (dst_bytes)
3514 bcopy (source, destination, coding->produced);
3515 else
3516 safe_bcopy (source, destination, coding->produced);
3517 coding->consumed
3518 = coding->consumed_char = coding->produced_char = coding->produced;
3519 break;
3520 }
3521
3522 return result;
3523 }
3524
3525 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3526
3527 int
3528 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3529 struct coding_system *coding;
3530 unsigned char *source, *destination;
3531 int src_bytes, dst_bytes;
3532 {
3533 int result;
3534
3535 if (src_bytes <= 0)
3536 {
3537 coding->produced = coding->produced_char = 0;
3538 coding->consumed = coding->consumed_char = 0;
3539 return CODING_FINISH_NORMAL;
3540 }
3541
3542 switch (coding->type)
3543 {
3544 case coding_type_emacs_mule:
3545 case coding_type_undecided:
3546 case coding_type_raw_text:
3547 if (coding->eol_type == CODING_EOL_LF
3548 || coding->eol_type == CODING_EOL_UNDECIDED)
3549 goto label_no_conversion;
3550 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3551 break;
3552
3553 case coding_type_sjis:
3554 result = encode_coding_sjis_big5 (coding, source, destination,
3555 src_bytes, dst_bytes, 1);
3556 break;
3557
3558 case coding_type_iso2022:
3559 result = encode_coding_iso2022 (coding, source, destination,
3560 src_bytes, dst_bytes);
3561 break;
3562
3563 case coding_type_big5:
3564 result = encode_coding_sjis_big5 (coding, source, destination,
3565 src_bytes, dst_bytes, 0);
3566 break;
3567
3568 case coding_type_ccl:
3569 result = ccl_coding_driver (coding, source, destination,
3570 src_bytes, dst_bytes, 1);
3571 break;
3572
3573 default: /* i.e. case coding_type_no_conversion: */
3574 label_no_conversion:
3575 if (dst_bytes && src_bytes > dst_bytes)
3576 {
3577 coding->produced = dst_bytes;
3578 result = CODING_FINISH_INSUFFICIENT_DST;
3579 }
3580 else
3581 {
3582 coding->produced = src_bytes;
3583 result = CODING_FINISH_NORMAL;
3584 }
3585 if (dst_bytes)
3586 bcopy (source, destination, coding->produced);
3587 else
3588 safe_bcopy (source, destination, coding->produced);
3589 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3590 {
3591 unsigned char *p = destination, *pend = p + coding->produced;
3592 while (p < pend)
3593 if (*p++ == '\015') p[-1] = '\n';
3594 }
3595 coding->consumed
3596 = coding->consumed_char = coding->produced_char = coding->produced;
3597 break;
3598 }
3599
3600 return result;
3601 }
3602
3603 /* Scan text in the region between *BEG and *END, skip characters
3604 which we don't have to decode by coding system CODING at the head
3605 and tail, then set *BEG and *END to the region of the text we
3606 actually have to convert.
3607
3608 If STR is not NULL, *BEG and *END are indices into STR. */
3609
3610 static void
3611 shrink_decoding_region (beg, end, coding, str)
3612 int *beg, *end;
3613 struct coding_system *coding;
3614 unsigned char *str;
3615 {
3616 unsigned char *begp_orig, *begp, *endp_orig, *endp;
3617 int eol_conversion;
3618
3619 if (coding->type == coding_type_ccl
3620 || coding->type == coding_type_undecided
3621 || !NILP (coding->post_read_conversion))
3622 {
3623 /* We can't skip any data. */
3624 return;
3625 }
3626 else if (coding->type == coding_type_no_conversion)
3627 {
3628 /* We need no conversion. */
3629 *beg = *end;
3630 return;
3631 }
3632
3633 if (coding->heading_ascii >= 0)
3634 /* Detection routine has already found how much we can skip at the
3635 head. */
3636 *beg += coding->heading_ascii;
3637
3638 if (str)
3639 {
3640 begp_orig = begp = str + *beg;
3641 endp_orig = endp = str + *end;
3642 }
3643 else
3644 {
3645 move_gap (*beg);
3646 begp_orig = begp = GAP_END_ADDR;
3647 endp_orig = endp = begp + *end - *beg;
3648 }
3649
3650 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3651
3652 switch (coding->type)
3653 {
3654 case coding_type_emacs_mule:
3655 case coding_type_raw_text:
3656 if (eol_conversion)
3657 {
3658 if (coding->heading_ascii < 0)
3659 while (begp < endp && *begp != '\r') begp++;
3660 while (begp < endp && *(endp - 1) != '\r') endp--;
3661 }
3662 else
3663 begp = endp;
3664 break;
3665
3666 case coding_type_sjis:
3667 case coding_type_big5:
3668 /* We can skip all ASCII characters at the head. */
3669 if (coding->heading_ascii < 0)
3670 {
3671 if (eol_conversion)
3672 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
3673 else
3674 while (begp < endp && *begp < 0x80) begp++;
3675 }
3676 /* We can skip all ASCII characters at the tail except for the
3677 second byte of SJIS or BIG5 code. */
3678 if (eol_conversion)
3679 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
3680 else
3681 while (begp < endp && endp[-1] < 0x80) endp--;
3682 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3683 endp++;
3684 break;
3685
3686 default: /* i.e. case coding_type_iso2022: */
3687 if (coding->heading_ascii < 0)
3688 {
3689 unsigned char c;
3690
3691 /* We can skip all ASCII characters at the head except for a
3692 few control codes. */
3693 while (begp < endp && (c = *begp) < 0x80
3694 && c != ISO_CODE_CR && c != ISO_CODE_SO
3695 && c != ISO_CODE_SI && c != ISO_CODE_ESC
3696 && (!eol_conversion || c != ISO_CODE_LF))
3697 begp++;
3698 }
3699 switch (coding->category_idx)
3700 {
3701 case CODING_CATEGORY_IDX_ISO_8_1:
3702 case CODING_CATEGORY_IDX_ISO_8_2:
3703 /* We can skip all ASCII characters at the tail. */
3704 if (eol_conversion)
3705 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
3706 else
3707 while (begp < endp && endp[-1] < 0x80) endp--;
3708 break;
3709
3710 case CODING_CATEGORY_IDX_ISO_7:
3711 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
3712 /* We can skip all charactes at the tail except for ESC and
3713 the following 2-byte at the tail. */
3714 if (eol_conversion)
3715 while (begp < endp && endp[-1] != ISO_CODE_ESC && endp[-1] != '\n')
3716 endp--;
3717 else
3718 while (begp < endp && endp[-1] != ISO_CODE_ESC)
3719 endp--;
3720 if (begp < endp && endp[-1] == ISO_CODE_ESC)
3721 {
3722 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
3723 /* This is an ASCII designation sequence. We can
3724 surely skip the tail. */
3725 endp += 2;
3726 else
3727 /* Hmmm, we can't skip the tail. */
3728 endp = endp_orig;
3729 }
3730 }
3731 }
3732 *beg += begp - begp_orig;
3733 *end += endp - endp_orig;
3734 return;
3735 }
3736
3737 /* Like shrink_decoding_region but for encoding. */
3738
3739 static void
3740 shrink_encoding_region (beg, end, coding, str)
3741 int *beg, *end;
3742 struct coding_system *coding;
3743 unsigned char *str;
3744 {
3745 unsigned char *begp_orig, *begp, *endp_orig, *endp;
3746 int eol_conversion;
3747
3748 if (coding->type == coding_type_ccl)
3749 /* We can't skip any data. */
3750 return;
3751 else if (coding->type == coding_type_no_conversion)
3752 {
3753 /* We need no conversion. */
3754 *beg = *end;
3755 return;
3756 }
3757
3758 if (str)
3759 {
3760 begp_orig = begp = str + *beg;
3761 endp_orig = endp = str + *end;
3762 }
3763 else
3764 {
3765 move_gap (*beg);
3766 begp_orig = begp = GAP_END_ADDR;
3767 endp_orig = endp = begp + *end - *beg;
3768 }
3769
3770 eol_conversion = (coding->eol_type == CODING_EOL_CR
3771 || coding->eol_type == CODING_EOL_CRLF);
3772
3773 /* Here, we don't have to check coding->pre_write_conversion because
3774 the caller is expected to have handled it already. */
3775 switch (coding->type)
3776 {
3777 case coding_type_undecided:
3778 case coding_type_emacs_mule:
3779 case coding_type_raw_text:
3780 if (eol_conversion)
3781 {
3782 while (begp < endp && *begp != '\n') begp++;
3783 while (begp < endp && endp[-1] != '\n') endp--;
3784 }
3785 else
3786 begp = endp;
3787 break;
3788
3789 case coding_type_iso2022:
3790 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3791 {
3792 unsigned char *bol = begp;
3793 while (begp < endp && *begp < 0x80)
3794 {
3795 begp++;
3796 if (begp[-1] == '\n')
3797 bol = begp;
3798 }
3799 begp = bol;
3800 goto label_skip_tail;
3801 }
3802 /* fall down ... */
3803
3804 default:
3805 /* We can skip all ASCII characters at the head and tail. */
3806 if (eol_conversion)
3807 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
3808 else
3809 while (begp < endp && *begp < 0x80) begp++;
3810 label_skip_tail:
3811 if (eol_conversion)
3812 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
3813 else
3814 while (begp < endp && *(endp - 1) < 0x80) endp--;
3815 break;
3816 }
3817
3818 *beg += begp - begp_orig;
3819 *end += endp - endp_orig;
3820 return;
3821 }
3822
3823 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
3824 text from FROM to TO by coding system CODING, and return number of
3825 characters in the resulting text.
3826
3827 If ADJUST is nonzero, we do various things as if the original text
3828 is deleted and a new text is inserted. See the comments in
3829 replace_range (insdel.c) to know what we are doing.
3830
3831 ADJUST nonzero also means that post-read-conversion or
3832 pre-write-conversion functions (if any) should be processed. */
3833
3834 int
3835 code_convert_region (from, to, coding, encodep, adjust)
3836 int from, to, encodep, adjust;
3837 struct coding_system *coding;
3838 {
3839 int len = to - from, require, inserted, inserted_byte;
3840 int from_byte, to_byte, len_byte;
3841 int from_byte_orig, to_byte_orig;
3842 Lisp_Object saved_coding_symbol = Qnil;
3843
3844 if (adjust)
3845 {
3846 prepare_to_modify_buffer (from, to, &from);
3847 to = from + len;
3848 }
3849 from_byte = CHAR_TO_BYTE (from); to_byte = CHAR_TO_BYTE (to);
3850 len_byte = to_byte - from_byte;
3851
3852 if (! encodep && CODING_REQUIRE_DETECTION (coding))
3853 {
3854 /* We must detect encoding of text and eol. Even if detection
3855 routines can't decide the encoding, we should not let them
3856 undecided because the deeper decoding routine (decode_coding)
3857 tries to detect the encodings in vain in that case. */
3858
3859 if (from < GPT && to > GPT)
3860 move_gap_both (from, from_byte);
3861 if (coding->type == coding_type_undecided)
3862 {
3863 detect_coding (coding, BYTE_POS_ADDR (from), len);
3864 if (coding->type == coding_type_undecided)
3865 coding->type = coding_type_emacs_mule;
3866 }
3867 if (coding->eol_type == CODING_EOL_UNDECIDED)
3868 {
3869 saved_coding_symbol = coding->symbol;
3870 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
3871 if (coding->eol_type == CODING_EOL_UNDECIDED)
3872 coding->eol_type = CODING_EOL_LF;
3873 /* We had better recover the original eol format if we
3874 encounter an inconsitent eol format while decoding. */
3875 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
3876 }
3877 }
3878
3879 if (encodep
3880 ? ! CODING_REQUIRE_ENCODING (coding)
3881 : ! CODING_REQUIRE_DECODING (coding))
3882 return len;
3883
3884 /* Now we convert the text. */
3885
3886 /* For encoding, we must process pre-write-conversion in advance. */
3887 if (encodep
3888 && adjust
3889 && ! NILP (coding->pre_write_conversion)
3890 && SYMBOLP (coding->pre_write_conversion)
3891 && ! NILP (Ffboundp (coding->pre_write_conversion)))
3892 {
3893 /* The function in pre-write-conversion put a new text in a new
3894 buffer. */
3895 struct buffer *prev = current_buffer, *new;
3896
3897 call2 (coding->pre_write_conversion, from, to);
3898 if (current_buffer != prev)
3899 {
3900 len = ZV - BEGV;
3901 new = current_buffer;
3902 set_buffer_internal_1 (prev);
3903 del_range (from, to);
3904 insert_from_buffer (new, BEG, len, 0);
3905 to = from + len;
3906 to_byte = CHAR_TO_BYTE (to);
3907 len_byte = to_byte - from_byte;
3908 }
3909 }
3910
3911 /* Try to skip the heading and tailing ASCIIs. */
3912 from_byte_orig = from_byte; to_byte_orig = to_byte;
3913 if (encodep)
3914 shrink_encoding_region (&from_byte, &to_byte, coding, NULL);
3915 else
3916 shrink_decoding_region (&from_byte, &to_byte, coding, NULL);
3917 if (from_byte == to_byte)
3918 return len;
3919 /* Here, the excluded region by shrinking contains only ASCIIs. */
3920 from += (from_byte - from_byte_orig);
3921 to += (to_byte - to_byte_orig);
3922 len = to - from;
3923 len_byte = to_byte - from_byte;
3924
3925 /* For converion, we must put the gap before the text to be decoded
3926 in addition to make the gap larger for efficient decoding. The
3927 required gap size starts from 2000 which is the magic number used
3928 in make_gap. But, after one batch of conversion, it will be
3929 incremented if we find that it is not enough . */
3930 require = 2000;
3931
3932 if (GAP_SIZE < require)
3933 make_gap (require - GAP_SIZE);
3934 move_gap_both (from, from_byte);
3935
3936 if (adjust)
3937 adjust_before_replace (from, from_byte, to, to_byte);
3938
3939 if (GPT - BEG < beg_unchanged)
3940 beg_unchanged = GPT - BEG;
3941 if (Z - GPT < end_unchanged)
3942 end_unchanged = Z - GPT;
3943
3944 inserted = inserted_byte = 0;
3945 for (;;)
3946 {
3947 int result, diff_char, diff_byte;
3948
3949 /* The buffer memory is changed from:
3950 +--------+converted-text+------------+-----original-text-----+---+
3951 |<-from->|<--inserted-->|<-GAP_SIZE->|<---------len--------->|---| */
3952
3953 if (encodep)
3954 result = encode_coding (coding, GAP_END_ADDR, GPT_ADDR, len_byte, 0);
3955 else
3956 result = decode_coding (coding, GAP_END_ADDR, GPT_ADDR, len_byte, 0);
3957 /* to:
3958 +--------+-------converted-text--------+--+---original-text--+---+
3959 |<-from->|<----(inserted+produced)---->|--|<-(len-consumed)->|---| */
3960
3961 diff_char = coding->produced_char - coding->consumed_char;
3962 diff_byte = coding->produced - coding->consumed;
3963
3964 GAP_SIZE -= diff_byte;
3965 ZV += diff_char; ZV_BYTE += diff_byte;
3966 Z += diff_char; Z_BYTE += diff_byte;
3967 GPT += coding->produced_char; GPT_BYTE += coding->produced;
3968
3969 inserted += coding->produced_char;
3970 inserted_byte += coding->produced;
3971 len -= coding->consumed_char;
3972 len_byte -= coding->consumed;
3973
3974 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
3975 {
3976 unsigned char *p = GPT_ADDR - inserted_byte, *pend = GPT_ADDR;
3977
3978 /* Encode LFs back to the original eol format (CR or CRLF). */
3979 if (coding->eol_type == CODING_EOL_CR)
3980 {
3981 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
3982 }
3983 else
3984 {
3985 unsigned char *p2 = p;
3986 int count = 0;
3987
3988 while (p2 < pend) if (*p2++ == '\n') count++;
3989 if (GAP_SIZE < count)
3990 make_gap (count - GAP_SIZE);
3991 p2 = GPT_ADDR + count;
3992 while (p < pend)
3993 {
3994 *--p2 = *--pend;
3995 if (*pend == '\n') *--p2 = '\r';
3996 }
3997 GPT += count; GAP_SIZE -= count; ZV += count; Z += count;
3998 ZV_BYTE += count; Z_BYTE += count;
3999 coding->produced += count;
4000 coding->produced_char += count;
4001 inserted += count;
4002 inserted_byte += count;
4003 }
4004
4005 /* Suppress eol-format conversion in the further conversion. */
4006 coding->eol_type = CODING_EOL_LF;
4007
4008 /* Restore the original symbol. */
4009 coding->symbol = saved_coding_symbol;
4010 }
4011 if (len_byte <= 0)
4012 break;
4013 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4014 {
4015 /* The source text ends in invalid codes. Let's just
4016 make them valid buffer contents, and finish conversion. */
4017 inserted += len;
4018 inserted_byte += len_byte;
4019 break;
4020 }
4021 if (inserted == coding->produced_char)
4022 /* We have just done the first batch of conversion. Let's
4023 reconsider the required gap size now.
4024
4025 We have converted CONSUMED bytes into PRODUCED bytes. To
4026 convert the remaining LEN bytes, we may need REQUIRE bytes
4027 of gap, where:
4028 REQUIRE + LEN = (LEN * PRODUCED / CONSUMED)
4029 REQUIRE = LEN * (PRODUCED - CONSUMED) / CONSUMED
4030 = LEN * DIFF / CONSUMED
4031 Here, we are sure that DIFF is positive. */
4032 require = len_byte * diff_byte / coding->consumed;
4033 if (GAP_SIZE < require)
4034 make_gap (require - GAP_SIZE);
4035 }
4036 if (GAP_SIZE > 0) *GPT_ADDR = 0; /* Put an anchor. */
4037
4038 if (adjust)
4039 {
4040 adjust_after_replace (from, from_byte, to, to_byte,
4041 inserted, inserted_byte);
4042
4043 if (! encodep && ! NILP (coding->post_read_conversion))
4044 {
4045 Lisp_Object val;
4046 int orig_inserted = inserted, pos = PT;
4047
4048 temp_set_point_both (current_buffer, from, from_byte);
4049 val = call1 (coding->post_read_conversion, make_number (inserted));
4050 if (! NILP (val))
4051 {
4052 CHECK_NUMBER (val, 0);
4053 inserted = XFASTINT (val);
4054 }
4055 if (pos >= from + orig_inserted)
4056 temp_set_point (current_buffer, pos + (inserted - orig_inserted));
4057 }
4058 }
4059
4060 return ((from_byte - from_byte_orig) + inserted + (to_byte_orig - to_byte));
4061 }
4062
4063 Lisp_Object
4064 code_convert_string (str, coding, encodep, nocopy)
4065 Lisp_Object str;
4066 struct coding_system *coding;
4067 int encodep, nocopy;
4068 {
4069 int len;
4070 char *buf;
4071 int from = 0, to = XSTRING (str)->size, to_byte = XSTRING (str)->size_byte;
4072 struct gcpro gcpro1;
4073 Lisp_Object saved_coding_symbol = Qnil;
4074 int result;
4075
4076 if (encodep && !NILP (coding->pre_write_conversion)
4077 || !encodep && !NILP (coding->post_read_conversion))
4078 {
4079 /* Since we have to call Lisp functions which assume target text
4080 is in a buffer, after setting a temporary buffer, call
4081 code_convert_region. */
4082 int count = specpdl_ptr - specpdl;
4083 struct buffer *prev = current_buffer;
4084
4085 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4086 temp_output_buffer_setup (" *code-converting-work*");
4087 set_buffer_internal (XBUFFER (Vstandard_output));
4088 if (encodep)
4089 insert_from_string (str, 0, 0, to, to_byte, 0);
4090 else
4091 {
4092 /* We must insert the contents of STR as is without
4093 unibyte<->multibyte conversion. */
4094 current_buffer->enable_multibyte_characters = Qnil;
4095 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4096 current_buffer->enable_multibyte_characters = Qt;
4097 }
4098 code_convert_region (BEGV, ZV, coding, encodep, 1);
4099 if (encodep)
4100 /* We must return the buffer contents as unibyte string. */
4101 current_buffer->enable_multibyte_characters = Qnil;
4102 str = make_buffer_string (BEGV, ZV, 0);
4103 set_buffer_internal (prev);
4104 return unbind_to (count, str);
4105 }
4106
4107 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4108 {
4109 /* See the comments in code_convert_region. */
4110 if (coding->type == coding_type_undecided)
4111 {
4112 detect_coding (coding, XSTRING (str)->data, to_byte);
4113 if (coding->type == coding_type_undecided)
4114 coding->type = coding_type_emacs_mule;
4115 }
4116 if (coding->eol_type == CODING_EOL_UNDECIDED)
4117 {
4118 saved_coding_symbol = coding->symbol;
4119 detect_eol (coding, XSTRING (str)->data, to_byte);
4120 if (coding->eol_type == CODING_EOL_UNDECIDED)
4121 coding->eol_type = CODING_EOL_LF;
4122 /* We had better recover the original eol format if we
4123 encounter an inconsitent eol format while decoding. */
4124 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4125 }
4126 }
4127
4128 if (encodep
4129 ? ! CODING_REQUIRE_ENCODING (coding)
4130 : ! CODING_REQUIRE_DECODING (coding))
4131 from = to_byte;
4132 else
4133 {
4134 /* Try to skip the heading and tailing ASCIIs. */
4135 if (encodep)
4136 shrink_encoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4137 else
4138 shrink_decoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4139 }
4140 if (from == to_byte)
4141 return (nocopy ? str : Fcopy_sequence (str));
4142
4143 if (encodep)
4144 len = encoding_buffer_size (coding, to_byte - from);
4145 else
4146 len = decoding_buffer_size (coding, to_byte - from);
4147 len += from + XSTRING (str)->size_byte - to_byte;
4148 GCPRO1 (str);
4149 buf = get_conversion_buffer (len);
4150 UNGCPRO;
4151
4152 if (from > 0)
4153 bcopy (XSTRING (str)->data, buf, from);
4154 result = (encodep
4155 ? encode_coding (coding, XSTRING (str)->data + from,
4156 buf + from, to_byte - from, len)
4157 : decode_coding (coding, XSTRING (str)->data + from,
4158 buf + from, to - from, len));
4159 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4160 {
4161 /* We simple try to decode the whole string again but without
4162 eol-conversion this time. */
4163 coding->eol_type = CODING_EOL_LF;
4164 coding->symbol = saved_coding_symbol;
4165 return code_convert_string (str, coding, encodep, nocopy);
4166 }
4167
4168 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
4169 XSTRING (str)->size_byte - to_byte);
4170
4171 len = from + XSTRING (str)->size_byte - to_byte;
4172 if (encodep)
4173 str = make_unibyte_string (buf, len + coding->produced);
4174 else
4175 str = make_multibyte_string (buf, len + coding->produced_char,
4176 len + coding->produced);
4177 return str;
4178 }
4179
4180 \f
4181 #ifdef emacs
4182 /*** 7. Emacs Lisp library functions ***/
4183
4184 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4185 "Return t if OBJECT is nil or a coding-system.\n\
4186 See the documentation of `make-coding-system' for information\n\
4187 about coding-system objects.")
4188 (obj)
4189 Lisp_Object obj;
4190 {
4191 if (NILP (obj))
4192 return Qt;
4193 if (!SYMBOLP (obj))
4194 return Qnil;
4195 /* Get coding-spec vector for OBJ. */
4196 obj = Fget (obj, Qcoding_system);
4197 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4198 ? Qt : Qnil);
4199 }
4200
4201 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4202 Sread_non_nil_coding_system, 1, 1, 0,
4203 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4204 (prompt)
4205 Lisp_Object prompt;
4206 {
4207 Lisp_Object val;
4208 do
4209 {
4210 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4211 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4212 }
4213 while (XSTRING (val)->size == 0);
4214 return (Fintern (val, Qnil));
4215 }
4216
4217 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4218 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4219 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4220 (prompt, default_coding_system)
4221 Lisp_Object prompt, default_coding_system;
4222 {
4223 Lisp_Object val;
4224 if (SYMBOLP (default_coding_system))
4225 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4226 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4227 Qt, Qnil, Qcoding_system_history,
4228 default_coding_system, Qnil);
4229 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4230 }
4231
4232 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4233 1, 1, 0,
4234 "Check validity of CODING-SYSTEM.\n\
4235 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4236 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4237 The value of property should be a vector of length 5.")
4238 (coding_system)
4239 Lisp_Object coding_system;
4240 {
4241 CHECK_SYMBOL (coding_system, 0);
4242 if (!NILP (Fcoding_system_p (coding_system)))
4243 return coding_system;
4244 while (1)
4245 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4246 }
4247 \f
4248 Lisp_Object
4249 detect_coding_system (src, src_bytes, highest)
4250 unsigned char *src;
4251 int src_bytes, highest;
4252 {
4253 int coding_mask, eol_type;
4254 Lisp_Object val, tmp;
4255 int dummy;
4256
4257 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4258 eol_type = detect_eol_type (src, src_bytes, &dummy);
4259 if (eol_type == CODING_EOL_INCONSISTENT)
4260 eol_type == CODING_EOL_UNDECIDED;
4261
4262 if (!coding_mask)
4263 {
4264 val = Qundecided;
4265 if (eol_type != CODING_EOL_UNDECIDED)
4266 {
4267 Lisp_Object val2;
4268 val2 = Fget (Qundecided, Qeol_type);
4269 if (VECTORP (val2))
4270 val = XVECTOR (val2)->contents[eol_type];
4271 }
4272 return val;
4273 }
4274
4275 /* At first, gather possible coding systems in VAL. */
4276 val = Qnil;
4277 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4278 {
4279 int idx
4280 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4281 if (coding_mask & (1 << idx))
4282 {
4283 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4284 if (highest)
4285 break;
4286 }
4287 }
4288 if (!highest)
4289 val = Fnreverse (val);
4290
4291 /* Then, substitute the elements by subsidiary coding systems. */
4292 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4293 {
4294 if (eol_type != CODING_EOL_UNDECIDED)
4295 {
4296 Lisp_Object eol;
4297 eol = Fget (XCONS (tmp)->car, Qeol_type);
4298 if (VECTORP (eol))
4299 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4300 }
4301 }
4302 return (highest ? XCONS (val)->car : val);
4303 }
4304
4305 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4306 2, 3, 0,
4307 "Detect coding system of the text in the region between START and END.\n\
4308 Return a list of possible coding systems ordered by priority.\n\
4309 \n\
4310 If only ASCII characters are found, it returns `undecided'\n\
4311 or its subsidiary coding system according to a detected end-of-line format.\n\
4312 \n\
4313 If optional argument HIGHEST is non-nil, return the coding system of\n\
4314 highest priority.")
4315 (start, end, highest)
4316 Lisp_Object start, end, highest;
4317 {
4318 int from, to;
4319 int from_byte, to_byte;
4320
4321 CHECK_NUMBER_COERCE_MARKER (start, 0);
4322 CHECK_NUMBER_COERCE_MARKER (end, 1);
4323
4324 validate_region (&start, &end);
4325 from = XINT (start), to = XINT (end);
4326 from_byte = CHAR_TO_BYTE (from);
4327 to_byte = CHAR_TO_BYTE (to);
4328
4329 if (from < GPT && to >= GPT)
4330 move_gap_both (to, to_byte);
4331
4332 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4333 to_byte - from_byte,
4334 !NILP (highest));
4335 }
4336
4337 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4338 1, 2, 0,
4339 "Detect coding system of the text in STRING.\n\
4340 Return a list of possible coding systems ordered by priority.\n\
4341 \n\
4342 If only ASCII characters are found, it returns `undecided'\n\
4343 or its subsidiary coding system according to a detected end-of-line format.\n\
4344 \n\
4345 If optional argument HIGHEST is non-nil, return the coding system of\n\
4346 highest priority.")
4347 (string, highest)
4348 Lisp_Object string, highest;
4349 {
4350 CHECK_STRING (string, 0);
4351
4352 return detect_coding_system (XSTRING (string)->data,
4353 XSTRING (string)->size_byte,
4354 !NILP (highest));
4355 }
4356
4357 Lisp_Object
4358 code_convert_region1 (start, end, coding_system, encodep)
4359 Lisp_Object start, end, coding_system;
4360 int encodep;
4361 {
4362 struct coding_system coding;
4363 int from, to, len;
4364
4365 CHECK_NUMBER_COERCE_MARKER (start, 0);
4366 CHECK_NUMBER_COERCE_MARKER (end, 1);
4367 CHECK_SYMBOL (coding_system, 2);
4368
4369 validate_region (&start, &end);
4370 from = XFASTINT (start);
4371 to = XFASTINT (end);
4372
4373 if (NILP (coding_system))
4374 return make_number (to - from);
4375
4376 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4377 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4378
4379 coding.mode |= CODING_MODE_LAST_BLOCK;
4380 len = code_convert_region (from, to, &coding, encodep, 1);
4381 return make_number (len);
4382 }
4383
4384 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4385 3, 3, "r\nzCoding system: ",
4386 "Decode the current region by specified coding system.\n\
4387 When called from a program, takes three arguments:\n\
4388 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4389 Return length of decoded text.")
4390 (start, end, coding_system)
4391 Lisp_Object start, end, coding_system;
4392 {
4393 return code_convert_region1 (start, end, coding_system, 0);
4394 }
4395
4396 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4397 3, 3, "r\nzCoding system: ",
4398 "Encode the current region by specified coding system.\n\
4399 When called from a program, takes three arguments:\n\
4400 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4401 Return length of encoded text.")
4402 (start, end, coding_system)
4403 Lisp_Object start, end, coding_system;
4404 {
4405 return code_convert_region1 (start, end, coding_system, 1);
4406 }
4407
4408 Lisp_Object
4409 code_convert_string1 (string, coding_system, nocopy, encodep)
4410 Lisp_Object string, coding_system, nocopy;
4411 int encodep;
4412 {
4413 struct coding_system coding;
4414
4415 CHECK_STRING (string, 0);
4416 CHECK_SYMBOL (coding_system, 1);
4417
4418 if (NILP (coding_system))
4419 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4420
4421 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4422 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4423
4424 coding.mode |= CODING_MODE_LAST_BLOCK;
4425 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4426 }
4427
4428 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
4429 2, 3, 0,
4430 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4431 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4432 if the decoding operation is trivial.")
4433 (string, coding_system, nocopy)
4434 Lisp_Object string, coding_system, nocopy;
4435 {
4436 return code_convert_string1(string, coding_system, nocopy, 0);
4437 }
4438
4439 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
4440 2, 3, 0,
4441 "Encode STRING to CODING-SYSTEM, and return the result.\n\
4442 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4443 if the encoding operation is trivial.")
4444 (string, coding_system, nocopy)
4445 Lisp_Object string, coding_system, nocopy;
4446 {
4447 return code_convert_string1(string, coding_system, nocopy, 1);
4448 }
4449
4450 \f
4451 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
4452 "Decode a JISX0208 character of shift-jis encoding.\n\
4453 CODE is the character code in SJIS.\n\
4454 Return the corresponding character.")
4455 (code)
4456 Lisp_Object code;
4457 {
4458 unsigned char c1, c2, s1, s2;
4459 Lisp_Object val;
4460
4461 CHECK_NUMBER (code, 0);
4462 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
4463 DECODE_SJIS (s1, s2, c1, c2);
4464 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
4465 return val;
4466 }
4467
4468 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
4469 "Encode a JISX0208 character CHAR to SJIS coding system.\n\
4470 Return the corresponding character code in SJIS.")
4471 (ch)
4472 Lisp_Object ch;
4473 {
4474 int charset, c1, c2, s1, s2;
4475 Lisp_Object val;
4476
4477 CHECK_NUMBER (ch, 0);
4478 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4479 if (charset == charset_jisx0208)
4480 {
4481 ENCODE_SJIS (c1, c2, s1, s2);
4482 XSETFASTINT (val, (s1 << 8) | s2);
4483 }
4484 else
4485 XSETFASTINT (val, 0);
4486 return val;
4487 }
4488
4489 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
4490 "Decode a Big5 character CODE of BIG5 coding system.\n\
4491 CODE is the character code in BIG5.\n\
4492 Return the corresponding character.")
4493 (code)
4494 Lisp_Object code;
4495 {
4496 int charset;
4497 unsigned char b1, b2, c1, c2;
4498 Lisp_Object val;
4499
4500 CHECK_NUMBER (code, 0);
4501 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
4502 DECODE_BIG5 (b1, b2, charset, c1, c2);
4503 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
4504 return val;
4505 }
4506
4507 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
4508 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4509 Return the corresponding character code in Big5.")
4510 (ch)
4511 Lisp_Object ch;
4512 {
4513 int charset, c1, c2, b1, b2;
4514 Lisp_Object val;
4515
4516 CHECK_NUMBER (ch, 0);
4517 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4518 if (charset == charset_big5_1 || charset == charset_big5_2)
4519 {
4520 ENCODE_BIG5 (charset, c1, c2, b1, b2);
4521 XSETFASTINT (val, (b1 << 8) | b2);
4522 }
4523 else
4524 XSETFASTINT (val, 0);
4525 return val;
4526 }
4527 \f
4528 DEFUN ("set-terminal-coding-system-internal",
4529 Fset_terminal_coding_system_internal,
4530 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4531 (coding_system)
4532 Lisp_Object coding_system;
4533 {
4534 CHECK_SYMBOL (coding_system, 0);
4535 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4536 /* We had better not send unsafe characters to terminal. */
4537 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
4538
4539 return Qnil;
4540 }
4541
4542 DEFUN ("set-safe-terminal-coding-system-internal",
4543 Fset_safe_terminal_coding_system_internal,
4544 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
4545 (coding_system)
4546 Lisp_Object coding_system;
4547 {
4548 CHECK_SYMBOL (coding_system, 0);
4549 setup_coding_system (Fcheck_coding_system (coding_system),
4550 &safe_terminal_coding);
4551 return Qnil;
4552 }
4553
4554 DEFUN ("terminal-coding-system",
4555 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
4556 "Return coding system specified for terminal output.")
4557 ()
4558 {
4559 return terminal_coding.symbol;
4560 }
4561
4562 DEFUN ("set-keyboard-coding-system-internal",
4563 Fset_keyboard_coding_system_internal,
4564 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4565 (coding_system)
4566 Lisp_Object coding_system;
4567 {
4568 CHECK_SYMBOL (coding_system, 0);
4569 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
4570 return Qnil;
4571 }
4572
4573 DEFUN ("keyboard-coding-system",
4574 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
4575 "Return coding system specified for decoding keyboard input.")
4576 ()
4577 {
4578 return keyboard_coding.symbol;
4579 }
4580
4581 \f
4582 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
4583 Sfind_operation_coding_system, 1, MANY, 0,
4584 "Choose a coding system for an operation based on the target name.\n\
4585 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
4586 DECODING-SYSTEM is the coding system to use for decoding\n\
4587 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
4588 for encoding (in case OPERATION does encoding).\n\
4589 \n\
4590 The first argument OPERATION specifies an I/O primitive:\n\
4591 For file I/O, `insert-file-contents' or `write-region'.\n\
4592 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
4593 For network I/O, `open-network-stream'.\n\
4594 \n\
4595 The remaining arguments should be the same arguments that were passed\n\
4596 to the primitive. Depending on which primitive, one of those arguments\n\
4597 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
4598 whichever argument specifies the file name is TARGET.\n\
4599 \n\
4600 TARGET has a meaning which depends on OPERATION:\n\
4601 For file I/O, TARGET is a file name.\n\
4602 For process I/O, TARGET is a process name.\n\
4603 For network I/O, TARGET is a service name or a port number\n\
4604 \n\
4605 This function looks up what specified for TARGET in,\n\
4606 `file-coding-system-alist', `process-coding-system-alist',\n\
4607 or `network-coding-system-alist' depending on OPERATION.\n\
4608 They may specify a coding system, a cons of coding systems,\n\
4609 or a function symbol to call.\n\
4610 In the last case, we call the function with one argument,\n\
4611 which is a list of all the arguments given to this function.")
4612 (nargs, args)
4613 int nargs;
4614 Lisp_Object *args;
4615 {
4616 Lisp_Object operation, target_idx, target, val;
4617 register Lisp_Object chain;
4618
4619 if (nargs < 2)
4620 error ("Too few arguments");
4621 operation = args[0];
4622 if (!SYMBOLP (operation)
4623 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
4624 error ("Invalid first arguement");
4625 if (nargs < 1 + XINT (target_idx))
4626 error ("Too few arguments for operation: %s",
4627 XSYMBOL (operation)->name->data);
4628 target = args[XINT (target_idx) + 1];
4629 if (!(STRINGP (target)
4630 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
4631 error ("Invalid %dth argument", XINT (target_idx) + 1);
4632
4633 chain = ((EQ (operation, Qinsert_file_contents)
4634 || EQ (operation, Qwrite_region))
4635 ? Vfile_coding_system_alist
4636 : (EQ (operation, Qopen_network_stream)
4637 ? Vnetwork_coding_system_alist
4638 : Vprocess_coding_system_alist));
4639 if (NILP (chain))
4640 return Qnil;
4641
4642 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4643 {
4644 Lisp_Object elt;
4645 elt = XCONS (chain)->car;
4646
4647 if (CONSP (elt)
4648 && ((STRINGP (target)
4649 && STRINGP (XCONS (elt)->car)
4650 && fast_string_match (XCONS (elt)->car, target) >= 0)
4651 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
4652 {
4653 val = XCONS (elt)->cdr;
4654 /* Here, if VAL is both a valid coding system and a valid
4655 function symbol, we return VAL as a coding system. */
4656 if (CONSP (val))
4657 return val;
4658 if (! SYMBOLP (val))
4659 return Qnil;
4660 if (! NILP (Fcoding_system_p (val)))
4661 return Fcons (val, val);
4662 if (! NILP (Ffboundp (val)))
4663 {
4664 val = call1 (val, Flist (nargs, args));
4665 if (CONSP (val))
4666 return val;
4667 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
4668 return Fcons (val, val);
4669 }
4670 return Qnil;
4671 }
4672 }
4673 return Qnil;
4674 }
4675
4676 DEFUN ("update-iso-coding-systems", Fupdate_iso_coding_systems,
4677 Supdate_iso_coding_systems, 0, 0, 0,
4678 "Update internal database for ISO2022 based coding systems.\n\
4679 When values of the following coding categories are changed, you must\n\
4680 call this function:\n\
4681 coding-category-iso-7, coding-category-iso-7-tight,\n\
4682 coding-category-iso-8-1, coding-category-iso-8-2,\n\
4683 coding-category-iso-7-else, coding-category-iso-8-else")
4684 ()
4685 {
4686 int i;
4687
4688 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_ISO_8_ELSE;
4689 i++)
4690 {
4691 if (! coding_system_table[i])
4692 coding_system_table[i]
4693 = (struct coding_system *) xmalloc (sizeof (struct coding_system));
4694 setup_coding_system
4695 (XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value,
4696 coding_system_table[i]);
4697 }
4698 return Qnil;
4699 }
4700
4701 #endif /* emacs */
4702
4703 \f
4704 /*** 8. Post-amble ***/
4705
4706 init_coding_once ()
4707 {
4708 int i;
4709
4710 /* Emacs' internal format specific initialize routine. */
4711 for (i = 0; i <= 0x20; i++)
4712 emacs_code_class[i] = EMACS_control_code;
4713 emacs_code_class[0x0A] = EMACS_linefeed_code;
4714 emacs_code_class[0x0D] = EMACS_carriage_return_code;
4715 for (i = 0x21 ; i < 0x7F; i++)
4716 emacs_code_class[i] = EMACS_ascii_code;
4717 emacs_code_class[0x7F] = EMACS_control_code;
4718 emacs_code_class[0x80] = EMACS_leading_code_composition;
4719 for (i = 0x81; i < 0xFF; i++)
4720 emacs_code_class[i] = EMACS_invalid_code;
4721 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
4722 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
4723 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
4724 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
4725
4726 /* ISO2022 specific initialize routine. */
4727 for (i = 0; i < 0x20; i++)
4728 iso_code_class[i] = ISO_control_code;
4729 for (i = 0x21; i < 0x7F; i++)
4730 iso_code_class[i] = ISO_graphic_plane_0;
4731 for (i = 0x80; i < 0xA0; i++)
4732 iso_code_class[i] = ISO_control_code;
4733 for (i = 0xA1; i < 0xFF; i++)
4734 iso_code_class[i] = ISO_graphic_plane_1;
4735 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
4736 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4737 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
4738 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
4739 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
4740 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
4741 iso_code_class[ISO_CODE_ESC] = ISO_escape;
4742 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
4743 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
4744 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
4745
4746 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
4747 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
4748
4749 setup_coding_system (Qnil, &keyboard_coding);
4750 setup_coding_system (Qnil, &terminal_coding);
4751 setup_coding_system (Qnil, &safe_terminal_coding);
4752
4753 bzero (coding_system_table, sizeof coding_system_table);
4754
4755 #if defined (MSDOS) || defined (WINDOWSNT)
4756 system_eol_type = CODING_EOL_CRLF;
4757 #else
4758 system_eol_type = CODING_EOL_LF;
4759 #endif
4760 }
4761
4762 #ifdef emacs
4763
4764 syms_of_coding ()
4765 {
4766 Qtarget_idx = intern ("target-idx");
4767 staticpro (&Qtarget_idx);
4768
4769 Qcoding_system_history = intern ("coding-system-history");
4770 staticpro (&Qcoding_system_history);
4771 Fset (Qcoding_system_history, Qnil);
4772
4773 /* Target FILENAME is the first argument. */
4774 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
4775 /* Target FILENAME is the third argument. */
4776 Fput (Qwrite_region, Qtarget_idx, make_number (2));
4777
4778 Qcall_process = intern ("call-process");
4779 staticpro (&Qcall_process);
4780 /* Target PROGRAM is the first argument. */
4781 Fput (Qcall_process, Qtarget_idx, make_number (0));
4782
4783 Qcall_process_region = intern ("call-process-region");
4784 staticpro (&Qcall_process_region);
4785 /* Target PROGRAM is the third argument. */
4786 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
4787
4788 Qstart_process = intern ("start-process");
4789 staticpro (&Qstart_process);
4790 /* Target PROGRAM is the third argument. */
4791 Fput (Qstart_process, Qtarget_idx, make_number (2));
4792
4793 Qopen_network_stream = intern ("open-network-stream");
4794 staticpro (&Qopen_network_stream);
4795 /* Target SERVICE is the fourth argument. */
4796 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
4797
4798 Qcoding_system = intern ("coding-system");
4799 staticpro (&Qcoding_system);
4800
4801 Qeol_type = intern ("eol-type");
4802 staticpro (&Qeol_type);
4803
4804 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
4805 staticpro (&Qbuffer_file_coding_system);
4806
4807 Qpost_read_conversion = intern ("post-read-conversion");
4808 staticpro (&Qpost_read_conversion);
4809
4810 Qpre_write_conversion = intern ("pre-write-conversion");
4811 staticpro (&Qpre_write_conversion);
4812
4813 Qno_conversion = intern ("no-conversion");
4814 staticpro (&Qno_conversion);
4815
4816 Qundecided = intern ("undecided");
4817 staticpro (&Qundecided);
4818
4819 Qcoding_system_p = intern ("coding-system-p");
4820 staticpro (&Qcoding_system_p);
4821
4822 Qcoding_system_error = intern ("coding-system-error");
4823 staticpro (&Qcoding_system_error);
4824
4825 Fput (Qcoding_system_error, Qerror_conditions,
4826 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
4827 Fput (Qcoding_system_error, Qerror_message,
4828 build_string ("Invalid coding system"));
4829
4830 Qcoding_category = intern ("coding-category");
4831 staticpro (&Qcoding_category);
4832 Qcoding_category_index = intern ("coding-category-index");
4833 staticpro (&Qcoding_category_index);
4834
4835 Vcoding_category_table
4836 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
4837 staticpro (&Vcoding_category_table);
4838 {
4839 int i;
4840 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4841 {
4842 XVECTOR (Vcoding_category_table)->contents[i]
4843 = intern (coding_category_name[i]);
4844 Fput (XVECTOR (Vcoding_category_table)->contents[i],
4845 Qcoding_category_index, make_number (i));
4846 }
4847 }
4848
4849 Qcharacter_unification_table = intern ("character-unification-table");
4850 staticpro (&Qcharacter_unification_table);
4851 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
4852 make_number (0));
4853
4854 Qcharacter_unification_table_for_decode
4855 = intern ("character-unification-table-for-decode");
4856 staticpro (&Qcharacter_unification_table_for_decode);
4857
4858 Qcharacter_unification_table_for_encode
4859 = intern ("character-unification-table-for-encode");
4860 staticpro (&Qcharacter_unification_table_for_encode);
4861
4862 Qsafe_charsets = intern ("safe-charsets");
4863 staticpro (&Qsafe_charsets);
4864
4865 Qemacs_mule = intern ("emacs-mule");
4866 staticpro (&Qemacs_mule);
4867
4868 Qraw_text = intern ("raw-text");
4869 staticpro (&Qraw_text);
4870
4871 defsubr (&Scoding_system_p);
4872 defsubr (&Sread_coding_system);
4873 defsubr (&Sread_non_nil_coding_system);
4874 defsubr (&Scheck_coding_system);
4875 defsubr (&Sdetect_coding_region);
4876 defsubr (&Sdetect_coding_string);
4877 defsubr (&Sdecode_coding_region);
4878 defsubr (&Sencode_coding_region);
4879 defsubr (&Sdecode_coding_string);
4880 defsubr (&Sencode_coding_string);
4881 defsubr (&Sdecode_sjis_char);
4882 defsubr (&Sencode_sjis_char);
4883 defsubr (&Sdecode_big5_char);
4884 defsubr (&Sencode_big5_char);
4885 defsubr (&Sset_terminal_coding_system_internal);
4886 defsubr (&Sset_safe_terminal_coding_system_internal);
4887 defsubr (&Sterminal_coding_system);
4888 defsubr (&Sset_keyboard_coding_system_internal);
4889 defsubr (&Skeyboard_coding_system);
4890 defsubr (&Sfind_operation_coding_system);
4891 defsubr (&Supdate_iso_coding_systems);
4892
4893 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
4894 "List of coding systems.\n\
4895 \n\
4896 Do not alter the value of this variable manually. This variable should be\n\
4897 updated by the functions `make-coding-system' and\n\
4898 `define-coding-system-alias'.");
4899 Vcoding_system_list = Qnil;
4900
4901 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
4902 "Alist of coding system names.\n\
4903 Each element is one element list of coding system name.\n\
4904 This variable is given to `completing-read' as TABLE argument.\n\
4905 \n\
4906 Do not alter the value of this variable manually. This variable should be\n\
4907 updated by the functions `make-coding-system' and\n\
4908 `define-coding-system-alias'.");
4909 Vcoding_system_alist = Qnil;
4910
4911 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
4912 "List of coding-categories (symbols) ordered by priority.");
4913 {
4914 int i;
4915
4916 Vcoding_category_list = Qnil;
4917 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
4918 Vcoding_category_list
4919 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
4920 Vcoding_category_list);
4921 }
4922
4923 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
4924 "Specify the coding system for read operations.\n\
4925 It is useful to bind this variable with `let', but do not set it globally.\n\
4926 If the value is a coding system, it is used for decoding on read operation.\n\
4927 If not, an appropriate element is used from one of the coding system alists:\n\
4928 There are three such tables, `file-coding-system-alist',\n\
4929 `process-coding-system-alist', and `network-coding-system-alist'.");
4930 Vcoding_system_for_read = Qnil;
4931
4932 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
4933 "Specify the coding system for write operations.\n\
4934 It is useful to bind this variable with `let', but do not set it globally.\n\
4935 If the value is a coding system, it is used for encoding on write operation.\n\
4936 If not, an appropriate element is used from one of the coding system alists:\n\
4937 There are three such tables, `file-coding-system-alist',\n\
4938 `process-coding-system-alist', and `network-coding-system-alist'.");
4939 Vcoding_system_for_write = Qnil;
4940
4941 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4942 "Coding system used in the latest file or process I/O.");
4943 Vlast_coding_system_used = Qnil;
4944
4945 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4946 "*Non-nil means inhibit code conversion of end-of-line format in all cases.");
4947 inhibit_eol_conversion = 0;
4948
4949 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4950 "Alist to decide a coding system to use for a file I/O operation.\n\
4951 The format is ((PATTERN . VAL) ...),\n\
4952 where PATTERN is a regular expression matching a file name,\n\
4953 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4954 If VAL is a coding system, it is used for both decoding and encoding\n\
4955 the file contents.\n\
4956 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4957 and the cdr part is used for encoding.\n\
4958 If VAL is a function symbol, the function must return a coding system\n\
4959 or a cons of coding systems which are used as above.\n\
4960 \n\
4961 See also the function `find-operation-coding-system'.");
4962 Vfile_coding_system_alist = Qnil;
4963
4964 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4965 "Alist to decide a coding system to use for a process I/O operation.\n\
4966 The format is ((PATTERN . VAL) ...),\n\
4967 where PATTERN is a regular expression matching a program name,\n\
4968 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4969 If VAL is a coding system, it is used for both decoding what received\n\
4970 from the program and encoding what sent to the program.\n\
4971 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4972 and the cdr part is used for encoding.\n\
4973 If VAL is a function symbol, the function must return a coding system\n\
4974 or a cons of coding systems which are used as above.\n\
4975 \n\
4976 See also the function `find-operation-coding-system'.");
4977 Vprocess_coding_system_alist = Qnil;
4978
4979 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4980 "Alist to decide a coding system to use for a network I/O operation.\n\
4981 The format is ((PATTERN . VAL) ...),\n\
4982 where PATTERN is a regular expression matching a network service name\n\
4983 or is a port number to connect to,\n\
4984 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4985 If VAL is a coding system, it is used for both decoding what received\n\
4986 from the network stream and encoding what sent to the network stream.\n\
4987 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4988 and the cdr part is used for encoding.\n\
4989 If VAL is a function symbol, the function must return a coding system\n\
4990 or a cons of coding systems which are used as above.\n\
4991 \n\
4992 See also the function `find-operation-coding-system'.");
4993 Vnetwork_coding_system_alist = Qnil;
4994
4995 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4996 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4997 eol_mnemonic_unix = ':';
4998
4999 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
5000 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5001 eol_mnemonic_dos = '\\';
5002
5003 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
5004 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5005 eol_mnemonic_mac = '/';
5006
5007 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5008 "Mnemonic character indicating end-of-line format is not yet decided.");
5009 eol_mnemonic_undecided = ':';
5010
5011 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
5012 "Non-nil means ISO 2022 encoder/decoder do character unification.");
5013 Venable_character_unification = Qt;
5014
5015 DEFVAR_LISP ("standard-character-unification-table-for-decode",
5016 &Vstandard_character_unification_table_for_decode,
5017 "Table for unifying characters when reading.");
5018 Vstandard_character_unification_table_for_decode = Qnil;
5019
5020 DEFVAR_LISP ("standard-character-unification-table-for-encode",
5021 &Vstandard_character_unification_table_for_encode,
5022 "Table for unifying characters when writing.");
5023 Vstandard_character_unification_table_for_encode = Qnil;
5024
5025 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5026 "Alist of charsets vs revision numbers.\n\
5027 While encoding, if a charset (car part of an element) is found,\n\
5028 designate it with the escape sequence identifying revision (cdr part of the element).");
5029 Vcharset_revision_alist = Qnil;
5030
5031 DEFVAR_LISP ("default-process-coding-system",
5032 &Vdefault_process_coding_system,
5033 "Cons of coding systems used for process I/O by default.\n\
5034 The car part is used for decoding a process output,\n\
5035 the cdr part is used for encoding a text to be sent to a process.");
5036 Vdefault_process_coding_system = Qnil;
5037
5038 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5039 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5040 This is a vector of length 256.\n\
5041 If Nth element is non-nil, the existence of code N in a file\n\
5042 \(or output of subprocess) doesn't prevent it to be detected as\n\
5043 a coding system of ISO 2022 variant which has a flag\n\
5044 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5045 or reading output of a subprocess.\n\
5046 Only 128th through 159th elements have a meaning.");
5047 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5048
5049 DEFVAR_LISP ("select-safe-coding-system-function",
5050 &Vselect_safe_coding_system_function,
5051 "Function to call to select safe coding system for encoding a text.\n\
5052 \n\
5053 If set, this function is called to force a user to select a proper\n\
5054 coding system which can encode the text in the case that a default\n\
5055 coding system used in each operation can't encode the text.\n\
5056 \n\
5057 The default value is `select-safe-coding-system' (which see).");
5058 Vselect_safe_coding_system_function = Qnil;
5059
5060 }
5061
5062 #endif /* emacs */