]> code.delx.au - gnu-emacs/blob - src/coding.c
(Fset_terminal_coding_system_internal): Set
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
69
70 4. Other
71
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See section 6 for more details.
81
82 */
83
84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
85
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
88 whereas DOS's format is two-byte sequence of `carriage-return' and
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
90
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
93 any format of end-of-line. So, Emacs has information of format of
94 end-of-line in each coding-system. See section 6 for more details.
95
96 */
97
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
99
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX is set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105 #if 0
106 int
107 detect_coding_emacs_mule (src, src_end)
108 unsigned char *src, *src_end;
109 {
110 ...
111 }
112 #endif
113
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
115
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format (emacs-mule). The resulting text
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
122 #if 0
123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
128 {
129 ...
130 }
131 #endif
132
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
134
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
141 #if 0
142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
147 {
148 ...
149 }
150 #endif
151
152 /*** COMMONLY USED MACROS ***/
153
154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
159
160 #define ONE_MORE_BYTE(c1) \
161 do { \
162 if (src < src_end) \
163 c1 = *src++; \
164 else \
165 goto label_end_of_loop; \
166 } while (0)
167
168 #define TWO_MORE_BYTES(c1, c2) \
169 do { \
170 if (src + 1 < src_end) \
171 c1 = *src++, c2 = *src++; \
172 else \
173 goto label_end_of_loop; \
174 } while (0)
175
176 #define THREE_MORE_BYTES(c1, c2, c3) \
177 do { \
178 if (src + 2 < src_end) \
179 c1 = *src++, c2 = *src++, c3 = *src++; \
180 else \
181 goto label_end_of_loop; \
182 } while (0)
183
184 /* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
190
191 /* Decode one ASCII character C. */
192
193 #define DECODE_CHARACTER_ASCII(c) \
194 do { \
195 if (COMPOSING_P (coding->composing)) \
196 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
197 else \
198 *dst++ = (c); \
199 } while (0)
200
201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
202 position-code is C. */
203
204 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
205 do { \
206 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
207 if (COMPOSING_P (coding->composing)) \
208 *dst++ = leading_code + 0x20; \
209 else \
210 *dst++ = leading_code; \
211 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
212 *dst++ = leading_code; \
213 *dst++ = (c) | 0x80; \
214 } while (0)
215
216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
217 position-codes are C1 and C2. */
218
219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
220 do { \
221 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
222 *dst++ = (c2) | 0x80; \
223 } while (0)
224
225 \f
226 /*** 1. Preamble ***/
227
228 #include <stdio.h>
229
230 #ifdef emacs
231
232 #include <config.h>
233 #include "lisp.h"
234 #include "buffer.h"
235 #include "charset.h"
236 #include "ccl.h"
237 #include "coding.h"
238 #include "window.h"
239
240 #else /* not emacs */
241
242 #include "mulelib.h"
243
244 #endif /* not emacs */
245
246 Lisp_Object Qcoding_system, Qeol_type;
247 Lisp_Object Qbuffer_file_coding_system;
248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
249
250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252 Lisp_Object Qstart_process, Qopen_network_stream;
253 Lisp_Object Qtarget_idx;
254
255 /* Mnemonic character of each format of end-of-line. */
256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257 /* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259 int eol_mnemonic_undecided;
260
261 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
263 int system_eol_type;
264
265 #ifdef emacs
266
267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
268
269 /* Coding system emacs-mule is for converting only end-of-line format. */
270 Lisp_Object Qemacs_mule;
271
272 /* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274 /* Coding-system for reading files and receiving data from process. */
275 Lisp_Object Vcoding_system_for_read;
276 /* Coding-system for writing files and sending data to process. */
277 Lisp_Object Vcoding_system_for_write;
278 /* Coding-system actually used in the latest I/O. */
279 Lisp_Object Vlast_coding_system_used;
280
281 /* A vector of length 256 which contains information about special
282 Microsoft codes. */
283 Lisp_Object Vmicrosoft_code_table;
284
285 /* Flag to inhibit code conversion of end-of-line format. */
286 int inhibit_eol_conversion;
287
288 /* Coding system to be used to encode text for terminal display. */
289 struct coding_system terminal_coding;
290
291 /* Coding system to be used to encode text for terminal display when
292 terminal coding system is nil. */
293 struct coding_system safe_terminal_coding;
294
295 /* Coding system of what is sent from terminal keyboard. */
296 struct coding_system keyboard_coding;
297
298 Lisp_Object Vfile_coding_system_alist;
299 Lisp_Object Vprocess_coding_system_alist;
300 Lisp_Object Vnetwork_coding_system_alist;
301
302 #endif /* emacs */
303
304 Lisp_Object Qcoding_category_index;
305
306 /* List of symbols `coding-category-xxx' ordered by priority. */
307 Lisp_Object Vcoding_category_list;
308
309 /* Table of coding-systems currently assigned to each coding-category. */
310 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
311
312 /* Table of names of symbol for each coding-category. */
313 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
314 "coding-category-emacs-mule",
315 "coding-category-sjis",
316 "coding-category-iso-7",
317 "coding-category-iso-8-1",
318 "coding-category-iso-8-2",
319 "coding-category-iso-7-else",
320 "coding-category-iso-8-else",
321 "coding-category-big5",
322 "coding-category-binary"
323 };
324
325 /* Flag to tell if we look up unification table on character code
326 conversion. */
327 Lisp_Object Venable_character_unification;
328 /* Standard unification table to look up on decoding (reading). */
329 Lisp_Object Vstandard_character_unification_table_for_decode;
330 /* Standard unification table to look up on encoding (writing). */
331 Lisp_Object Vstandard_character_unification_table_for_encode;
332
333 Lisp_Object Qcharacter_unification_table;
334 Lisp_Object Qcharacter_unification_table_for_decode;
335 Lisp_Object Qcharacter_unification_table_for_encode;
336
337 /* Alist of charsets vs revision number. */
338 Lisp_Object Vcharset_revision_alist;
339
340 /* Default coding systems used for process I/O. */
341 Lisp_Object Vdefault_process_coding_system;
342
343 \f
344 /*** 2. Emacs internal format (emacs-mule) handlers ***/
345
346 /* Emacs' internal format for encoding multiple character sets is a
347 kind of multi-byte encoding, i.e. characters are encoded by
348 variable-length sequences of one-byte codes. ASCII characters
349 and control characters (e.g. `tab', `newline') are represented by
350 one-byte sequences which are their ASCII codes, in the range 0x00
351 through 0x7F. The other characters are represented by a sequence
352 of `base leading-code', optional `extended leading-code', and one
353 or two `position-code's. The length of the sequence is determined
354 by the base leading-code. Leading-code takes the range 0x80
355 through 0x9F, whereas extended leading-code and position-code take
356 the range 0xA0 through 0xFF. See `charset.h' for more details
357 about leading-code and position-code.
358
359 There's one exception to this rule. Special leading-code
360 `leading-code-composition' denotes that the following several
361 characters should be composed into one character. Leading-codes of
362 components (except for ASCII) are added 0x20. An ASCII character
363 component is represented by a 2-byte sequence of `0xA0' and
364 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
365 details of composite character. Hence, we can summarize the code
366 range as follows:
367
368 --- CODE RANGE of Emacs' internal format ---
369 (character set) (range)
370 ASCII 0x00 .. 0x7F
371 ELSE (1st byte) 0x80 .. 0x9F
372 (rest bytes) 0xA0 .. 0xFF
373 ---------------------------------------------
374
375 */
376
377 enum emacs_code_class_type emacs_code_class[256];
378
379 /* Go to the next statement only if *SRC is accessible and the code is
380 greater than 0xA0. */
381 #define CHECK_CODE_RANGE_A0_FF \
382 do { \
383 if (src >= src_end) \
384 goto label_end_of_switch; \
385 else if (*src++ < 0xA0) \
386 return 0; \
387 } while (0)
388
389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
390 Check if a text is encoded in Emacs' internal format. If it is,
391 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
392
393 int
394 detect_coding_emacs_mule (src, src_end)
395 unsigned char *src, *src_end;
396 {
397 unsigned char c;
398 int composing = 0;
399
400 while (src < src_end)
401 {
402 c = *src++;
403
404 if (composing)
405 {
406 if (c < 0xA0)
407 composing = 0;
408 else
409 c -= 0x20;
410 }
411
412 switch (emacs_code_class[c])
413 {
414 case EMACS_ascii_code:
415 case EMACS_linefeed_code:
416 break;
417
418 case EMACS_control_code:
419 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
420 return 0;
421 break;
422
423 case EMACS_invalid_code:
424 return 0;
425
426 case EMACS_leading_code_composition: /* c == 0x80 */
427 if (composing)
428 CHECK_CODE_RANGE_A0_FF;
429 else
430 composing = 1;
431 break;
432
433 case EMACS_leading_code_4:
434 CHECK_CODE_RANGE_A0_FF;
435 /* fall down to check it two more times ... */
436
437 case EMACS_leading_code_3:
438 CHECK_CODE_RANGE_A0_FF;
439 /* fall down to check it one more time ... */
440
441 case EMACS_leading_code_2:
442 CHECK_CODE_RANGE_A0_FF;
443 break;
444
445 default:
446 label_end_of_switch:
447 break;
448 }
449 }
450 return CODING_CATEGORY_MASK_EMACS_MULE;
451 }
452
453 \f
454 /*** 3. ISO2022 handlers ***/
455
456 /* The following note describes the coding system ISO2022 briefly.
457 Since the intention of this note is to help in understanding of
458 the programs in this file, some parts are NOT ACCURATE or OVERLY
459 SIMPLIFIED. For the thorough understanding, please refer to the
460 original document of ISO2022.
461
462 ISO2022 provides many mechanisms to encode several character sets
463 in 7-bit and 8-bit environment. If one chooses 7-bite environment,
464 all text is encoded by codes of less than 128. This may make the
465 encoded text a little bit longer, but the text gets more stability
466 to pass through several gateways (some of them strip off the MSB).
467
468 There are two kinds of character set: control character set and
469 graphic character set. The former contains control characters such
470 as `newline' and `escape' to provide control functions (control
471 functions are provided also by escape sequences). The latter
472 contains graphic characters such as ' A' and '-'. Emacs recognizes
473 two control character sets and many graphic character sets.
474
475 Graphic character sets are classified into one of the following
476 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
477 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
478 bytes (DIMENSION) and the number of characters in one dimension
479 (CHARS) of the set. In addition, each character set is assigned an
480 identification tag (called "final character" and denoted as <F>
481 here after) which is unique in each class. <F> of each character
482 set is decided by ECMA(*) when it is registered in ISO. Code range
483 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
484
485 Note (*): ECMA = European Computer Manufacturers Association
486
487 Here are examples of graphic character set [NAME(<F>)]:
488 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
489 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
490 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
491 o DIMENSION2_CHARS96 -- none for the moment
492
493 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
494 C0 [0x00..0x1F] -- control character plane 0
495 GL [0x20..0x7F] -- graphic character plane 0
496 C1 [0x80..0x9F] -- control character plane 1
497 GR [0xA0..0xFF] -- graphic character plane 1
498
499 A control character set is directly designated and invoked to C0 or
500 C1 by an escape sequence. The most common case is that ISO646's
501 control character set is designated/invoked to C0 and ISO6429's
502 control character set is designated/invoked to C1, and usually
503 these designations/invocations are omitted in a coded text. With
504 7-bit environment, only C0 can be used, and a control character for
505 C1 is encoded by an appropriate escape sequence to fit in the
506 environment. All control characters for C1 are defined the
507 corresponding escape sequences.
508
509 A graphic character set is at first designated to one of four
510 graphic registers (G0 through G3), then these graphic registers are
511 invoked to GL or GR. These designations and invocations can be
512 done independently. The most common case is that G0 is invoked to
513 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
514 these invocations and designations are omitted in a coded text.
515 With 7-bit environment, only GL can be used.
516
517 When a graphic character set of CHARS94 is invoked to GL, code 0x20
518 and 0x7F of GL area work as control characters SPACE and DEL
519 respectively, and code 0xA0 and 0xFF of GR area should not be used.
520
521 There are two ways of invocation: locking-shift and single-shift.
522 With locking-shift, the invocation lasts until the next different
523 invocation, whereas with single-shift, the invocation works only
524 for the following character and doesn't affect locking-shift.
525 Invocations are done by the following control characters or escape
526 sequences.
527
528 ----------------------------------------------------------------------
529 function control char escape sequence description
530 ----------------------------------------------------------------------
531 SI (shift-in) 0x0F none invoke G0 to GL
532 SO (shift-out) 0x0E none invoke G1 to GL
533 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
534 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
535 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
536 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
537 ----------------------------------------------------------------------
538 The first four are for locking-shift. Control characters for these
539 functions are defined by macros ISO_CODE_XXX in `coding.h'.
540
541 Designations are done by the following escape sequences.
542 ----------------------------------------------------------------------
543 escape sequence description
544 ----------------------------------------------------------------------
545 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
546 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
547 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
548 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
549 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
550 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
551 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
552 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
553 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
554 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
555 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
556 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
557 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
558 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
559 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
560 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
561 ----------------------------------------------------------------------
562
563 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
564 of dimension 1, chars 94, and final character <F>, and etc.
565
566 Note (*): Although these designations are not allowed in ISO2022,
567 Emacs accepts them on decoding, and produces them on encoding
568 CHARS96 character set in a coding system which is characterized as
569 7-bit environment, non-locking-shift, and non-single-shift.
570
571 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
572 '(' can be omitted. We call this as "short-form" here after.
573
574 Now you may notice that there are a lot of ways for encoding the
575 same multilingual text in ISO2022. Actually, there exists many
576 coding systems such as Compound Text (used in X's inter client
577 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
578 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
579 localized platforms), and all of these are variants of ISO2022.
580
581 In addition to the above, Emacs handles two more kinds of escape
582 sequences: ISO6429's direction specification and Emacs' private
583 sequence for specifying character composition.
584
585 ISO6429's direction specification takes the following format:
586 o CSI ']' -- end of the current direction
587 o CSI '0' ']' -- end of the current direction
588 o CSI '1' ']' -- start of left-to-right text
589 o CSI '2' ']' -- start of right-to-left text
590 The control character CSI (0x9B: control sequence introducer) is
591 abbreviated to the escape sequence ESC '[' in 7-bit environment.
592
593 Character composition specification takes the following format:
594 o ESC '0' -- start character composition
595 o ESC '1' -- end character composition
596 Since these are not standard escape sequences of any ISO, the use
597 of them for these meaning is restricted to Emacs only. */
598
599 enum iso_code_class_type iso_code_class[256];
600
601 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
602 Check if a text is encoded in ISO2022. If it is, returns an
603 integer in which appropriate flag bits any of:
604 CODING_CATEGORY_MASK_ISO_7
605 CODING_CATEGORY_MASK_ISO_8_1
606 CODING_CATEGORY_MASK_ISO_8_2
607 CODING_CATEGORY_MASK_ISO_7_ELSE
608 CODING_CATEGORY_MASK_ISO_8_ELSE
609 are set. If a code which should never appear in ISO2022 is found,
610 returns 0. */
611
612 int
613 detect_coding_iso2022 (src, src_end)
614 unsigned char *src, *src_end;
615 {
616 int mask = (CODING_CATEGORY_MASK_ISO_7
617 | CODING_CATEGORY_MASK_ISO_8_1
618 | CODING_CATEGORY_MASK_ISO_8_2
619 | CODING_CATEGORY_MASK_ISO_7_ELSE
620 | CODING_CATEGORY_MASK_ISO_8_ELSE
621 );
622 int g1 = 0; /* 1 iff designating to G1. */
623 int c, i;
624
625 while (src < src_end)
626 {
627 c = *src++;
628 switch (c)
629 {
630 case ISO_CODE_ESC:
631 if (src >= src_end)
632 break;
633 c = *src++;
634 if ((c >= '(' && c <= '/'))
635 {
636 /* Designation sequence for a charset of dimension 1. */
637 if (src >= src_end)
638 break;
639 c = *src++;
640 if (c < ' ' || c >= 0x80)
641 /* Invalid designation sequence. */
642 return 0;
643 }
644 else if (c == '$')
645 {
646 /* Designation sequence for a charset of dimension 2. */
647 if (src >= src_end)
648 break;
649 c = *src++;
650 if (c >= '@' && c <= 'B')
651 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
652 ;
653 else if (c >= '(' && c <= '/')
654 {
655 if (src >= src_end)
656 break;
657 c = *src++;
658 if (c < ' ' || c >= 0x80)
659 /* Invalid designation sequence. */
660 return 0;
661 }
662 else
663 /* Invalid designation sequence. */
664 return 0;
665 }
666 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
667 /* Locking shift. */
668 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
669 | CODING_CATEGORY_MASK_ISO_8_ELSE);
670 else if (c == '0' || c == '1' || c == '2')
671 /* Start/end composition. */
672 ;
673 else
674 /* Invalid escape sequence. */
675 return 0;
676 break;
677
678 case ISO_CODE_SO:
679 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
680 | CODING_CATEGORY_MASK_ISO_8_ELSE);
681 break;
682
683 case ISO_CODE_CSI:
684 case ISO_CODE_SS2:
685 case ISO_CODE_SS3:
686 return CODING_CATEGORY_MASK_ISO_8_ELSE;
687
688 default:
689 if (c < 0x80)
690 break;
691 else if (c < 0xA0)
692 {
693 if (VECTORP (Vmicrosoft_code_table)
694 && !NILP (XVECTOR (Vmicrosoft_code_table)->contents[c]))
695 {
696 mask &= ~(CODING_CATEGORY_MASK_ISO_7
697 | CODING_CATEGORY_MASK_ISO_7_ELSE);
698 break;
699 }
700 return 0;
701 }
702 else
703 {
704 unsigned char *src_begin = src;
705
706 mask &= ~(CODING_CATEGORY_MASK_ISO_7
707 | CODING_CATEGORY_MASK_ISO_7_ELSE);
708 while (src < src_end && *src >= 0xA0)
709 src++;
710 if ((src - src_begin - 1) & 1 && src < src_end)
711 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
712 }
713 break;
714 }
715 }
716
717 return mask;
718 }
719
720 /* Decode a character of which charset is CHARSET and the 1st position
721 code is C1. If dimension of CHARSET is 2, the 2nd position code is
722 fetched from SRC and set to C2. If CHARSET is negative, it means
723 that we are decoding ill formed text, and what we can do is just to
724 read C1 as is. */
725
726 #define DECODE_ISO_CHARACTER(charset, c1) \
727 do { \
728 int c_alt, charset_alt = (charset); \
729 if (COMPOSING_HEAD_P (coding->composing)) \
730 { \
731 *dst++ = LEADING_CODE_COMPOSITION; \
732 if (COMPOSING_WITH_RULE_P (coding->composing)) \
733 /* To tell composition rules are embeded. */ \
734 *dst++ = 0xFF; \
735 coding->composing += 2; \
736 } \
737 if ((charset) >= 0) \
738 { \
739 if (CHARSET_DIMENSION (charset) == 2) \
740 ONE_MORE_BYTE (c2); \
741 if (!NILP (unification_table) \
742 && ((c_alt = unify_char (unification_table, \
743 -1, (charset), c1, c2)) >= 0)) \
744 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
745 } \
746 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
747 DECODE_CHARACTER_ASCII (c1); \
748 else if (CHARSET_DIMENSION (charset_alt) == 1) \
749 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
750 else \
751 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
752 if (COMPOSING_WITH_RULE_P (coding->composing)) \
753 /* To tell a composition rule follows. */ \
754 coding->composing = COMPOSING_WITH_RULE_RULE; \
755 } while (0)
756
757 /* Set designation state into CODING. */
758 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
759 do { \
760 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
761 make_number (chars), \
762 make_number (final_char)); \
763 if (charset >= 0) \
764 { \
765 if (coding->direction == 1 \
766 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
767 charset = CHARSET_REVERSE_CHARSET (charset); \
768 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
769 } \
770 } while (0)
771
772 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
773
774 int
775 decode_coding_iso2022 (coding, source, destination,
776 src_bytes, dst_bytes, consumed)
777 struct coding_system *coding;
778 unsigned char *source, *destination;
779 int src_bytes, dst_bytes;
780 int *consumed;
781 {
782 unsigned char *src = source;
783 unsigned char *src_end = source + src_bytes;
784 unsigned char *dst = destination;
785 unsigned char *dst_end = destination + dst_bytes;
786 /* Since the maximum bytes produced by each loop is 7, we subtract 6
787 from DST_END to assure that overflow checking is necessary only
788 at the head of loop. */
789 unsigned char *adjusted_dst_end = dst_end - 6;
790 int charset;
791 /* Charsets invoked to graphic plane 0 and 1 respectively. */
792 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
793 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
794 Lisp_Object unification_table
795 = coding->character_unification_table_for_decode;
796
797 if (!NILP (Venable_character_unification) && NILP (unification_table))
798 unification_table = Vstandard_character_unification_table_for_decode;
799
800 while (src < src_end && dst < adjusted_dst_end)
801 {
802 /* SRC_BASE remembers the start position in source in each loop.
803 The loop will be exited when there's not enough source text
804 to analyze long escape sequence or 2-byte code (within macros
805 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
806 to SRC_BASE before exiting. */
807 unsigned char *src_base = src;
808 int c1 = *src++, c2;
809
810 switch (iso_code_class [c1])
811 {
812 case ISO_0x20_or_0x7F:
813 if (!coding->composing
814 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
815 {
816 /* This is SPACE or DEL. */
817 *dst++ = c1;
818 break;
819 }
820 /* This is a graphic character, we fall down ... */
821
822 case ISO_graphic_plane_0:
823 if (coding->composing == COMPOSING_WITH_RULE_RULE)
824 {
825 /* This is a composition rule. */
826 *dst++ = c1 | 0x80;
827 coding->composing = COMPOSING_WITH_RULE_TAIL;
828 }
829 else
830 DECODE_ISO_CHARACTER (charset0, c1);
831 break;
832
833 case ISO_0xA0_or_0xFF:
834 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
835 {
836 /* Invalid code. */
837 *dst++ = c1;
838 break;
839 }
840 /* This is a graphic character, we fall down ... */
841
842 case ISO_graphic_plane_1:
843 DECODE_ISO_CHARACTER (charset1, c1);
844 break;
845
846 case ISO_control_code:
847 /* All ISO2022 control characters in this class have the
848 same representation in Emacs internal format. */
849 *dst++ = c1;
850 break;
851
852 case ISO_carriage_return:
853 if (coding->eol_type == CODING_EOL_CR)
854 {
855 *dst++ = '\n';
856 }
857 else if (coding->eol_type == CODING_EOL_CRLF)
858 {
859 ONE_MORE_BYTE (c1);
860 if (c1 == ISO_CODE_LF)
861 *dst++ = '\n';
862 else
863 {
864 src--;
865 *dst++ = c1;
866 }
867 }
868 else
869 {
870 *dst++ = c1;
871 }
872 break;
873
874 case ISO_shift_out:
875 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
876 goto label_invalid_escape_sequence;
877 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
878 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
879 break;
880
881 case ISO_shift_in:
882 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
883 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
884 break;
885
886 case ISO_single_shift_2_7:
887 case ISO_single_shift_2:
888 /* SS2 is handled as an escape sequence of ESC 'N' */
889 c1 = 'N';
890 goto label_escape_sequence;
891
892 case ISO_single_shift_3:
893 /* SS2 is handled as an escape sequence of ESC 'O' */
894 c1 = 'O';
895 goto label_escape_sequence;
896
897 case ISO_control_sequence_introducer:
898 /* CSI is handled as an escape sequence of ESC '[' ... */
899 c1 = '[';
900 goto label_escape_sequence;
901
902 case ISO_escape:
903 ONE_MORE_BYTE (c1);
904 label_escape_sequence:
905 /* Escape sequences handled by Emacs are invocation,
906 designation, direction specification, and character
907 composition specification. */
908 switch (c1)
909 {
910 case '&': /* revision of following character set */
911 ONE_MORE_BYTE (c1);
912 if (!(c1 >= '@' && c1 <= '~'))
913 goto label_invalid_escape_sequence;
914 ONE_MORE_BYTE (c1);
915 if (c1 != ISO_CODE_ESC)
916 goto label_invalid_escape_sequence;
917 ONE_MORE_BYTE (c1);
918 goto label_escape_sequence;
919
920 case '$': /* designation of 2-byte character set */
921 ONE_MORE_BYTE (c1);
922 if (c1 >= '@' && c1 <= 'B')
923 { /* designation of JISX0208.1978, GB2312.1980,
924 or JISX0208.1980 */
925 DECODE_DESIGNATION (0, 2, 94, c1);
926 }
927 else if (c1 >= 0x28 && c1 <= 0x2B)
928 { /* designation of DIMENSION2_CHARS94 character set */
929 ONE_MORE_BYTE (c2);
930 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
931 }
932 else if (c1 >= 0x2C && c1 <= 0x2F)
933 { /* designation of DIMENSION2_CHARS96 character set */
934 ONE_MORE_BYTE (c2);
935 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
936 }
937 else
938 goto label_invalid_escape_sequence;
939 break;
940
941 case 'n': /* invocation of locking-shift-2 */
942 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
943 goto label_invalid_escape_sequence;
944 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
945 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
946 break;
947
948 case 'o': /* invocation of locking-shift-3 */
949 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
950 goto label_invalid_escape_sequence;
951 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
952 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
953 break;
954
955 case 'N': /* invocation of single-shift-2 */
956 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
957 goto label_invalid_escape_sequence;
958 ONE_MORE_BYTE (c1);
959 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
960 DECODE_ISO_CHARACTER (charset, c1);
961 break;
962
963 case 'O': /* invocation of single-shift-3 */
964 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
965 goto label_invalid_escape_sequence;
966 ONE_MORE_BYTE (c1);
967 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
968 DECODE_ISO_CHARACTER (charset, c1);
969 break;
970
971 case '0': /* start composing without embeded rules */
972 coding->composing = COMPOSING_NO_RULE_HEAD;
973 break;
974
975 case '1': /* end composing */
976 coding->composing = COMPOSING_NO;
977 break;
978
979 case '2': /* start composing with embeded rules */
980 coding->composing = COMPOSING_WITH_RULE_HEAD;
981 break;
982
983 case '[': /* specification of direction */
984 /* For the moment, nested direction is not supported.
985 So, the value of `coding->direction' is 0 or 1: 0
986 means left-to-right, 1 means right-to-left. */
987 ONE_MORE_BYTE (c1);
988 switch (c1)
989 {
990 case ']': /* end of the current direction */
991 coding->direction = 0;
992
993 case '0': /* end of the current direction */
994 case '1': /* start of left-to-right direction */
995 ONE_MORE_BYTE (c1);
996 if (c1 == ']')
997 coding->direction = 0;
998 else
999 goto label_invalid_escape_sequence;
1000 break;
1001
1002 case '2': /* start of right-to-left direction */
1003 ONE_MORE_BYTE (c1);
1004 if (c1 == ']')
1005 coding->direction= 1;
1006 else
1007 goto label_invalid_escape_sequence;
1008 break;
1009
1010 default:
1011 goto label_invalid_escape_sequence;
1012 }
1013 break;
1014
1015 default:
1016 if (c1 >= 0x28 && c1 <= 0x2B)
1017 { /* designation of DIMENSION1_CHARS94 character set */
1018 ONE_MORE_BYTE (c2);
1019 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1020 }
1021 else if (c1 >= 0x2C && c1 <= 0x2F)
1022 { /* designation of DIMENSION1_CHARS96 character set */
1023 ONE_MORE_BYTE (c2);
1024 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1025 }
1026 else
1027 {
1028 goto label_invalid_escape_sequence;
1029 }
1030 }
1031 /* We must update these variables now. */
1032 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1033 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1034 break;
1035
1036 label_invalid_escape_sequence:
1037 {
1038 int length = src - src_base;
1039
1040 bcopy (src_base, dst, length);
1041 dst += length;
1042 }
1043 }
1044 continue;
1045
1046 label_end_of_loop:
1047 coding->carryover_size = src - src_base;
1048 bcopy (src_base, coding->carryover, coding->carryover_size);
1049 src = src_base;
1050 break;
1051 }
1052
1053 /* If this is the last block of the text to be decoded, we had
1054 better just flush out all remaining codes in the text although
1055 they are not valid characters. */
1056 if (coding->last_block)
1057 {
1058 bcopy (src, dst, src_end - src);
1059 dst += (src_end - src);
1060 src = src_end;
1061 }
1062 *consumed = src - source;
1063 return dst - destination;
1064 }
1065
1066 /* ISO2022 encoding stuff. */
1067
1068 /*
1069 It is not enough to say just "ISO2022" on encoding, we have to
1070 specify more details. In Emacs, each coding-system of ISO2022
1071 variant has the following specifications:
1072 1. Initial designation to G0 thru G3.
1073 2. Allows short-form designation?
1074 3. ASCII should be designated to G0 before control characters?
1075 4. ASCII should be designated to G0 at end of line?
1076 5. 7-bit environment or 8-bit environment?
1077 6. Use locking-shift?
1078 7. Use Single-shift?
1079 And the following two are only for Japanese:
1080 8. Use ASCII in place of JIS0201-1976-Roman?
1081 9. Use JISX0208-1983 in place of JISX0208-1978?
1082 These specifications are encoded in `coding->flags' as flag bits
1083 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1084 details.
1085 */
1086
1087 /* Produce codes (escape sequence) for designating CHARSET to graphic
1088 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1089 the coding system CODING allows, produce designation sequence of
1090 short-form. */
1091
1092 #define ENCODE_DESIGNATION(charset, reg, coding) \
1093 do { \
1094 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1095 char *intermediate_char_94 = "()*+"; \
1096 char *intermediate_char_96 = ",-./"; \
1097 Lisp_Object temp \
1098 = Fassq (make_number (charset), Vcharset_revision_alist); \
1099 if (! NILP (temp)) \
1100 { \
1101 *dst++ = ISO_CODE_ESC; \
1102 *dst++ = '&'; \
1103 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1104 } \
1105 *dst++ = ISO_CODE_ESC; \
1106 if (CHARSET_DIMENSION (charset) == 1) \
1107 { \
1108 if (CHARSET_CHARS (charset) == 94) \
1109 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1110 else \
1111 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1112 } \
1113 else \
1114 { \
1115 *dst++ = '$'; \
1116 if (CHARSET_CHARS (charset) == 94) \
1117 { \
1118 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1119 || reg != 0 \
1120 || final_char < '@' || final_char > 'B') \
1121 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1122 } \
1123 else \
1124 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1125 } \
1126 *dst++ = final_char; \
1127 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1128 } while (0)
1129
1130 /* The following two macros produce codes (control character or escape
1131 sequence) for ISO2022 single-shift functions (single-shift-2 and
1132 single-shift-3). */
1133
1134 #define ENCODE_SINGLE_SHIFT_2 \
1135 do { \
1136 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1137 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1138 else \
1139 *dst++ = ISO_CODE_SS2; \
1140 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1141 } while (0)
1142
1143 #define ENCODE_SINGLE_SHIFT_3 \
1144 do { \
1145 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1146 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1147 else \
1148 *dst++ = ISO_CODE_SS3; \
1149 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1150 } while (0)
1151
1152 /* The following four macros produce codes (control character or
1153 escape sequence) for ISO2022 locking-shift functions (shift-in,
1154 shift-out, locking-shift-2, and locking-shift-3). */
1155
1156 #define ENCODE_SHIFT_IN \
1157 do { \
1158 *dst++ = ISO_CODE_SI; \
1159 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1160 } while (0)
1161
1162 #define ENCODE_SHIFT_OUT \
1163 do { \
1164 *dst++ = ISO_CODE_SO; \
1165 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1166 } while (0)
1167
1168 #define ENCODE_LOCKING_SHIFT_2 \
1169 do { \
1170 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1171 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1172 } while (0)
1173
1174 #define ENCODE_LOCKING_SHIFT_3 \
1175 do { \
1176 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1177 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1178 } while (0)
1179
1180 /* Produce codes for a DIMENSION1 character whose character set is
1181 CHARSET and whose position-code is C1. Designation and invocation
1182 sequences are also produced in advance if necessary. */
1183
1184
1185 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1186 do { \
1187 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1188 { \
1189 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1190 *dst++ = c1 & 0x7F; \
1191 else \
1192 *dst++ = c1 | 0x80; \
1193 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1194 break; \
1195 } \
1196 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1197 { \
1198 *dst++ = c1 & 0x7F; \
1199 break; \
1200 } \
1201 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1202 { \
1203 *dst++ = c1 | 0x80; \
1204 break; \
1205 } \
1206 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1207 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1208 { \
1209 /* We should not encode this character, instead produce one or \
1210 two `?'s. */ \
1211 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1212 if (CHARSET_WIDTH (charset) == 2) \
1213 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1214 break; \
1215 } \
1216 else \
1217 /* Since CHARSET is not yet invoked to any graphic planes, we \
1218 must invoke it, or, at first, designate it to some graphic \
1219 register. Then repeat the loop to actually produce the \
1220 character. */ \
1221 dst = encode_invocation_designation (charset, coding, dst); \
1222 } while (1)
1223
1224 /* Produce codes for a DIMENSION2 character whose character set is
1225 CHARSET and whose position-codes are C1 and C2. Designation and
1226 invocation codes are also produced in advance if necessary. */
1227
1228 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1229 do { \
1230 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1231 { \
1232 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1233 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1234 else \
1235 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1236 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1237 break; \
1238 } \
1239 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1240 { \
1241 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1242 break; \
1243 } \
1244 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1245 { \
1246 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1247 break; \
1248 } \
1249 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1250 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1251 { \
1252 /* We should not encode this character, instead produce one or \
1253 two `?'s. */ \
1254 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1255 if (CHARSET_WIDTH (charset) == 2) \
1256 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1257 break; \
1258 } \
1259 else \
1260 /* Since CHARSET is not yet invoked to any graphic planes, we \
1261 must invoke it, or, at first, designate it to some graphic \
1262 register. Then repeat the loop to actually produce the \
1263 character. */ \
1264 dst = encode_invocation_designation (charset, coding, dst); \
1265 } while (1)
1266
1267 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1268 do { \
1269 int c_alt, charset_alt; \
1270 if (!NILP (unification_table) \
1271 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1272 >= 0)) \
1273 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1274 else \
1275 charset_alt = charset; \
1276 if (CHARSET_DIMENSION (charset_alt) == 1) \
1277 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1278 else \
1279 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1280 } while (0)
1281
1282 /* Produce designation and invocation codes at a place pointed by DST
1283 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1284 Return new DST. */
1285
1286 unsigned char *
1287 encode_invocation_designation (charset, coding, dst)
1288 int charset;
1289 struct coding_system *coding;
1290 unsigned char *dst;
1291 {
1292 int reg; /* graphic register number */
1293
1294 /* At first, check designations. */
1295 for (reg = 0; reg < 4; reg++)
1296 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1297 break;
1298
1299 if (reg >= 4)
1300 {
1301 /* CHARSET is not yet designated to any graphic registers. */
1302 /* At first check the requested designation. */
1303 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1304 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1305 /* Since CHARSET requests no special designation, designate it
1306 to graphic register 0. */
1307 reg = 0;
1308
1309 ENCODE_DESIGNATION (charset, reg, coding);
1310 }
1311
1312 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1313 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1314 {
1315 /* Since the graphic register REG is not invoked to any graphic
1316 planes, invoke it to graphic plane 0. */
1317 switch (reg)
1318 {
1319 case 0: /* graphic register 0 */
1320 ENCODE_SHIFT_IN;
1321 break;
1322
1323 case 1: /* graphic register 1 */
1324 ENCODE_SHIFT_OUT;
1325 break;
1326
1327 case 2: /* graphic register 2 */
1328 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1329 ENCODE_SINGLE_SHIFT_2;
1330 else
1331 ENCODE_LOCKING_SHIFT_2;
1332 break;
1333
1334 case 3: /* graphic register 3 */
1335 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1336 ENCODE_SINGLE_SHIFT_3;
1337 else
1338 ENCODE_LOCKING_SHIFT_3;
1339 break;
1340 }
1341 }
1342 return dst;
1343 }
1344
1345 /* The following two macros produce codes for indicating composition. */
1346 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1347 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1348 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1349
1350 /* The following three macros produce codes for indicating direction
1351 of text. */
1352 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1353 do { \
1354 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1355 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1356 else \
1357 *dst++ = ISO_CODE_CSI; \
1358 } while (0)
1359
1360 #define ENCODE_DIRECTION_R2L \
1361 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1362
1363 #define ENCODE_DIRECTION_L2R \
1364 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1365
1366 /* Produce codes for designation and invocation to reset the graphic
1367 planes and registers to initial state. */
1368 #define ENCODE_RESET_PLANE_AND_REGISTER \
1369 do { \
1370 int reg; \
1371 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1372 ENCODE_SHIFT_IN; \
1373 for (reg = 0; reg < 4; reg++) \
1374 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1375 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1376 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1377 ENCODE_DESIGNATION \
1378 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1379 } while (0)
1380
1381 /* Produce designation sequences of charsets in the line started from
1382 *SRC to a place pointed by DSTP.
1383
1384 If the current block ends before any end-of-line, we may fail to
1385 find all the necessary *designations. */
1386 encode_designation_at_bol (coding, table, src, src_end, dstp)
1387 struct coding_system *coding;
1388 Lisp_Object table;
1389 unsigned char *src, *src_end, **dstp;
1390 {
1391 int charset, c, found = 0, reg;
1392 /* Table of charsets to be designated to each graphic register. */
1393 int r[4];
1394 unsigned char *dst = *dstp;
1395
1396 for (reg = 0; reg < 4; reg++)
1397 r[reg] = -1;
1398
1399 while (src < src_end && *src != '\n' && found < 4)
1400 {
1401 int bytes = BYTES_BY_CHAR_HEAD (*src);
1402
1403 if (NILP (table))
1404 charset = CHARSET_AT (src);
1405 else
1406 {
1407 int c_alt, c1, c2;
1408
1409 SPLIT_STRING(src, bytes, charset, c1, c2);
1410 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1411 charset = CHAR_CHARSET (c_alt);
1412 }
1413
1414 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1415 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1416 {
1417 found++;
1418 r[reg] = charset;
1419 }
1420
1421 src += bytes;
1422 }
1423
1424 if (found)
1425 {
1426 for (reg = 0; reg < 4; reg++)
1427 if (r[reg] >= 0
1428 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1429 ENCODE_DESIGNATION (r[reg], reg, coding);
1430 *dstp = dst;
1431 }
1432 }
1433
1434 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1435
1436 int
1437 encode_coding_iso2022 (coding, source, destination,
1438 src_bytes, dst_bytes, consumed)
1439 struct coding_system *coding;
1440 unsigned char *source, *destination;
1441 int src_bytes, dst_bytes;
1442 int *consumed;
1443 {
1444 unsigned char *src = source;
1445 unsigned char *src_end = source + src_bytes;
1446 unsigned char *dst = destination;
1447 unsigned char *dst_end = destination + dst_bytes;
1448 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1449 from DST_END to assure overflow checking is necessary only at the
1450 head of loop. */
1451 unsigned char *adjusted_dst_end = dst_end - 19;
1452 Lisp_Object unification_table
1453 = coding->character_unification_table_for_encode;
1454
1455 if (!NILP (Venable_character_unification) && NILP (unification_table))
1456 unification_table = Vstandard_character_unification_table_for_encode;
1457
1458 while (src < src_end && dst < adjusted_dst_end)
1459 {
1460 /* SRC_BASE remembers the start position in source in each loop.
1461 The loop will be exited when there's not enough source text
1462 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1463 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1464 reset to SRC_BASE before exiting. */
1465 unsigned char *src_base = src;
1466 int charset, c1, c2, c3, c4;
1467
1468 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1469 && CODING_SPEC_ISO_BOL (coding))
1470 {
1471 /* We have to produce designation sequences if any now. */
1472 encode_designation_at_bol (coding, unification_table,
1473 src, src_end, &dst);
1474 CODING_SPEC_ISO_BOL (coding) = 0;
1475 }
1476
1477 c1 = *src++;
1478 /* If we are seeing a component of a composite character, we are
1479 seeing a leading-code specially encoded for composition, or a
1480 composition rule if composing with rule. We must set C1
1481 to a normal leading-code or an ASCII code. If we are not at
1482 a composed character, we must reset the composition state. */
1483 if (COMPOSING_P (coding->composing))
1484 {
1485 if (c1 < 0xA0)
1486 {
1487 /* We are not in a composite character any longer. */
1488 coding->composing = COMPOSING_NO;
1489 ENCODE_COMPOSITION_END;
1490 }
1491 else
1492 {
1493 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1494 {
1495 *dst++ = c1 & 0x7F;
1496 coding->composing = COMPOSING_WITH_RULE_HEAD;
1497 continue;
1498 }
1499 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1500 coding->composing = COMPOSING_WITH_RULE_RULE;
1501 if (c1 == 0xA0)
1502 {
1503 /* This is an ASCII component. */
1504 ONE_MORE_BYTE (c1);
1505 c1 &= 0x7F;
1506 }
1507 else
1508 /* This is a leading-code of non ASCII component. */
1509 c1 -= 0x20;
1510 }
1511 }
1512
1513 /* Now encode one character. C1 is a control character, an
1514 ASCII character, or a leading-code of multi-byte character. */
1515 switch (emacs_code_class[c1])
1516 {
1517 case EMACS_ascii_code:
1518 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1519 break;
1520
1521 case EMACS_control_code:
1522 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1523 ENCODE_RESET_PLANE_AND_REGISTER;
1524 *dst++ = c1;
1525 break;
1526
1527 case EMACS_carriage_return_code:
1528 if (!coding->selective)
1529 {
1530 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1531 ENCODE_RESET_PLANE_AND_REGISTER;
1532 *dst++ = c1;
1533 break;
1534 }
1535 /* fall down to treat '\r' as '\n' ... */
1536
1537 case EMACS_linefeed_code:
1538 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1539 ENCODE_RESET_PLANE_AND_REGISTER;
1540 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1541 bcopy (coding->spec.iso2022.initial_designation,
1542 coding->spec.iso2022.current_designation,
1543 sizeof coding->spec.iso2022.initial_designation);
1544 if (coding->eol_type == CODING_EOL_LF
1545 || coding->eol_type == CODING_EOL_UNDECIDED)
1546 *dst++ = ISO_CODE_LF;
1547 else if (coding->eol_type == CODING_EOL_CRLF)
1548 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1549 else
1550 *dst++ = ISO_CODE_CR;
1551 CODING_SPEC_ISO_BOL (coding) = 1;
1552 break;
1553
1554 case EMACS_leading_code_2:
1555 ONE_MORE_BYTE (c2);
1556 if (c2 < 0xA0)
1557 {
1558 /* invalid sequence */
1559 *dst++ = c1;
1560 *dst++ = c2;
1561 }
1562 else
1563 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1564 break;
1565
1566 case EMACS_leading_code_3:
1567 TWO_MORE_BYTES (c2, c3);
1568 if (c2 < 0xA0 || c3 < 0xA0)
1569 {
1570 /* invalid sequence */
1571 *dst++ = c1;
1572 *dst++ = c2;
1573 *dst++ = c3;
1574 }
1575 else if (c1 < LEADING_CODE_PRIVATE_11)
1576 ENCODE_ISO_CHARACTER (c1, c2, c3);
1577 else
1578 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1579 break;
1580
1581 case EMACS_leading_code_4:
1582 THREE_MORE_BYTES (c2, c3, c4);
1583 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1584 {
1585 /* invalid sequence */
1586 *dst++ = c1;
1587 *dst++ = c2;
1588 *dst++ = c3;
1589 *dst++ = c4;
1590 }
1591 else
1592 ENCODE_ISO_CHARACTER (c2, c3, c4);
1593 break;
1594
1595 case EMACS_leading_code_composition:
1596 ONE_MORE_BYTE (c2);
1597 if (c2 < 0xA0)
1598 {
1599 /* invalid sequence */
1600 *dst++ = c1;
1601 *dst++ = c2;
1602 }
1603 else if (c2 == 0xFF)
1604 {
1605 coding->composing = COMPOSING_WITH_RULE_HEAD;
1606 ENCODE_COMPOSITION_WITH_RULE_START;
1607 }
1608 else
1609 {
1610 /* Rewind one byte because it is a character code of
1611 composition elements. */
1612 src--;
1613 coding->composing = COMPOSING_NO_RULE_HEAD;
1614 ENCODE_COMPOSITION_NO_RULE_START;
1615 }
1616 break;
1617
1618 case EMACS_invalid_code:
1619 *dst++ = c1;
1620 break;
1621 }
1622 continue;
1623 label_end_of_loop:
1624 /* We reach here because the source date ends not at character
1625 boundary. */
1626 coding->carryover_size = src_end - src_base;
1627 bcopy (src_base, coding->carryover, coding->carryover_size);
1628 src = src_end;
1629 break;
1630 }
1631
1632 /* If this is the last block of the text to be encoded, we must
1633 reset graphic planes and registers to the initial state. */
1634 if (src >= src_end && coding->last_block)
1635 {
1636 ENCODE_RESET_PLANE_AND_REGISTER;
1637 if (coding->carryover_size > 0
1638 && coding->carryover_size < (dst_end - dst))
1639 {
1640 bcopy (coding->carryover, dst, coding->carryover_size);
1641 dst += coding->carryover_size;
1642 coding->carryover_size = 0;
1643 }
1644 }
1645 *consumed = src - source;
1646 return dst - destination;
1647 }
1648
1649 \f
1650 /*** 4. SJIS and BIG5 handlers ***/
1651
1652 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1653 quite widely. So, for the moment, Emacs supports them in the bare
1654 C code. But, in the future, they may be supported only by CCL. */
1655
1656 /* SJIS is a coding system encoding three character sets: ASCII, right
1657 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1658 as is. A character of charset katakana-jisx0201 is encoded by
1659 "position-code + 0x80". A character of charset japanese-jisx0208
1660 is encoded in 2-byte but two position-codes are divided and shifted
1661 so that it fit in the range below.
1662
1663 --- CODE RANGE of SJIS ---
1664 (character set) (range)
1665 ASCII 0x00 .. 0x7F
1666 KATAKANA-JISX0201 0xA0 .. 0xDF
1667 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1668 (2nd byte) 0x40 .. 0xFF
1669 -------------------------------
1670
1671 */
1672
1673 /* BIG5 is a coding system encoding two character sets: ASCII and
1674 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1675 character set and is encoded in two-byte.
1676
1677 --- CODE RANGE of BIG5 ---
1678 (character set) (range)
1679 ASCII 0x00 .. 0x7F
1680 Big5 (1st byte) 0xA1 .. 0xFE
1681 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1682 --------------------------
1683
1684 Since the number of characters in Big5 is larger than maximum
1685 characters in Emacs' charset (96x96), it can't be handled as one
1686 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1687 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1688 contains frequently used characters and the latter contains less
1689 frequently used characters. */
1690
1691 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1692 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1693 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1694 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1695
1696 /* Number of Big5 characters which have the same code in 1st byte. */
1697 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1698
1699 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1700 do { \
1701 unsigned int temp \
1702 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1703 if (b1 < 0xC9) \
1704 charset = charset_big5_1; \
1705 else \
1706 { \
1707 charset = charset_big5_2; \
1708 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1709 } \
1710 c1 = temp / (0xFF - 0xA1) + 0x21; \
1711 c2 = temp % (0xFF - 0xA1) + 0x21; \
1712 } while (0)
1713
1714 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1715 do { \
1716 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1717 if (charset == charset_big5_2) \
1718 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1719 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1720 b2 = temp % BIG5_SAME_ROW; \
1721 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1722 } while (0)
1723
1724 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1725 do { \
1726 int c_alt, charset_alt = (charset); \
1727 if (!NILP (unification_table) \
1728 && ((c_alt = unify_char (unification_table, \
1729 -1, (charset), c1, c2)) >= 0)) \
1730 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1731 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1732 DECODE_CHARACTER_ASCII (c1); \
1733 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1734 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1735 else \
1736 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1737 } while (0)
1738
1739 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1740 do { \
1741 int c_alt, charset_alt; \
1742 if (!NILP (unification_table) \
1743 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1744 >= 0)) \
1745 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1746 else \
1747 charset_alt = charset; \
1748 if (charset_alt == charset_ascii) \
1749 *dst++ = c1; \
1750 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1751 { \
1752 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1753 *dst++ = c1; \
1754 else \
1755 *dst++ = charset_alt, *dst++ = c1; \
1756 } \
1757 else \
1758 { \
1759 c1 &= 0x7F, c2 &= 0x7F; \
1760 if (sjis_p && charset_alt == charset_jisx0208) \
1761 { \
1762 unsigned char s1, s2; \
1763 \
1764 ENCODE_SJIS (c1, c2, s1, s2); \
1765 *dst++ = s1, *dst++ = s2; \
1766 } \
1767 else if (!sjis_p \
1768 && (charset_alt == charset_big5_1 \
1769 || charset_alt == charset_big5_2)) \
1770 { \
1771 unsigned char b1, b2; \
1772 \
1773 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
1774 *dst++ = b1, *dst++ = b2; \
1775 } \
1776 else \
1777 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1778 } \
1779 } while (0);
1780
1781 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1782 Check if a text is encoded in SJIS. If it is, return
1783 CODING_CATEGORY_MASK_SJIS, else return 0. */
1784
1785 int
1786 detect_coding_sjis (src, src_end)
1787 unsigned char *src, *src_end;
1788 {
1789 unsigned char c;
1790
1791 while (src < src_end)
1792 {
1793 c = *src++;
1794 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1795 return 0;
1796 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1797 {
1798 if (src < src_end && *src++ < 0x40)
1799 return 0;
1800 }
1801 }
1802 return CODING_CATEGORY_MASK_SJIS;
1803 }
1804
1805 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1806 Check if a text is encoded in BIG5. If it is, return
1807 CODING_CATEGORY_MASK_BIG5, else return 0. */
1808
1809 int
1810 detect_coding_big5 (src, src_end)
1811 unsigned char *src, *src_end;
1812 {
1813 unsigned char c;
1814
1815 while (src < src_end)
1816 {
1817 c = *src++;
1818 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1819 return 0;
1820 if (c >= 0xA1)
1821 {
1822 if (src >= src_end)
1823 break;
1824 c = *src++;
1825 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1826 return 0;
1827 }
1828 }
1829 return CODING_CATEGORY_MASK_BIG5;
1830 }
1831
1832 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1833 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1834
1835 int
1836 decode_coding_sjis_big5 (coding, source, destination,
1837 src_bytes, dst_bytes, consumed, sjis_p)
1838 struct coding_system *coding;
1839 unsigned char *source, *destination;
1840 int src_bytes, dst_bytes;
1841 int *consumed;
1842 int sjis_p;
1843 {
1844 unsigned char *src = source;
1845 unsigned char *src_end = source + src_bytes;
1846 unsigned char *dst = destination;
1847 unsigned char *dst_end = destination + dst_bytes;
1848 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1849 from DST_END to assure overflow checking is necessary only at the
1850 head of loop. */
1851 unsigned char *adjusted_dst_end = dst_end - 3;
1852 Lisp_Object unification_table
1853 = coding->character_unification_table_for_decode;
1854
1855 if (!NILP (Venable_character_unification) && NILP (unification_table))
1856 unification_table = Vstandard_character_unification_table_for_decode;
1857
1858 while (src < src_end && dst < adjusted_dst_end)
1859 {
1860 /* SRC_BASE remembers the start position in source in each loop.
1861 The loop will be exited when there's not enough source text
1862 to analyze two-byte character (within macro ONE_MORE_BYTE).
1863 In that case, SRC is reset to SRC_BASE before exiting. */
1864 unsigned char *src_base = src;
1865 unsigned char c1 = *src++, c2, c3, c4;
1866
1867 if (c1 == '\r')
1868 {
1869 if (coding->eol_type == CODING_EOL_CRLF)
1870 {
1871 ONE_MORE_BYTE (c2);
1872 if (c2 == '\n')
1873 *dst++ = c2;
1874 else
1875 /* To process C2 again, SRC is subtracted by 1. */
1876 *dst++ = c1, src--;
1877 }
1878 else
1879 *dst++ = c1;
1880 }
1881 else if (c1 < 0x20)
1882 *dst++ = c1;
1883 else if (c1 < 0x80)
1884 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1885 else if (c1 < 0xA0 || c1 >= 0xE0)
1886 {
1887 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1888 if (sjis_p)
1889 {
1890 ONE_MORE_BYTE (c2);
1891 DECODE_SJIS (c1, c2, c3, c4);
1892 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1893 }
1894 else if (c1 >= 0xE0 && c1 < 0xFF)
1895 {
1896 int charset;
1897
1898 ONE_MORE_BYTE (c2);
1899 DECODE_BIG5 (c1, c2, charset, c3, c4);
1900 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1901 }
1902 else /* Invalid code */
1903 *dst++ = c1;
1904 }
1905 else
1906 {
1907 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1908 if (sjis_p)
1909 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1910 else
1911 {
1912 int charset;
1913
1914 ONE_MORE_BYTE (c2);
1915 DECODE_BIG5 (c1, c2, charset, c3, c4);
1916 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1917 }
1918 }
1919 continue;
1920
1921 label_end_of_loop:
1922 coding->carryover_size = src - src_base;
1923 bcopy (src_base, coding->carryover, coding->carryover_size);
1924 src = src_base;
1925 break;
1926 }
1927
1928 *consumed = src - source;
1929 return dst - destination;
1930 }
1931
1932 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1933 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1934 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1935 sure that all these charsets are registered as official charset
1936 (i.e. do not have extended leading-codes). Characters of other
1937 charsets are produced without any encoding. If SJIS_P is 1, encode
1938 SJIS text, else encode BIG5 text. */
1939
1940 int
1941 encode_coding_sjis_big5 (coding, source, destination,
1942 src_bytes, dst_bytes, consumed, sjis_p)
1943 struct coding_system *coding;
1944 unsigned char *source, *destination;
1945 int src_bytes, dst_bytes;
1946 int *consumed;
1947 int sjis_p;
1948 {
1949 unsigned char *src = source;
1950 unsigned char *src_end = source + src_bytes;
1951 unsigned char *dst = destination;
1952 unsigned char *dst_end = destination + dst_bytes;
1953 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1954 from DST_END to assure overflow checking is necessary only at the
1955 head of loop. */
1956 unsigned char *adjusted_dst_end = dst_end - 1;
1957 Lisp_Object unification_table
1958 = coding->character_unification_table_for_encode;
1959
1960 if (!NILP (Venable_character_unification) && NILP (unification_table))
1961 unification_table = Vstandard_character_unification_table_for_encode;
1962
1963 while (src < src_end && dst < adjusted_dst_end)
1964 {
1965 /* SRC_BASE remembers the start position in source in each loop.
1966 The loop will be exited when there's not enough source text
1967 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1968 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1969 before exiting. */
1970 unsigned char *src_base = src;
1971 unsigned char c1 = *src++, c2, c3, c4;
1972
1973 if (coding->composing)
1974 {
1975 if (c1 == 0xA0)
1976 {
1977 ONE_MORE_BYTE (c1);
1978 c1 &= 0x7F;
1979 }
1980 else if (c1 >= 0xA0)
1981 c1 -= 0x20;
1982 else
1983 coding->composing = 0;
1984 }
1985
1986 switch (emacs_code_class[c1])
1987 {
1988 case EMACS_ascii_code:
1989 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1990 break;
1991
1992 case EMACS_control_code:
1993 *dst++ = c1;
1994 break;
1995
1996 case EMACS_carriage_return_code:
1997 if (!coding->selective)
1998 {
1999 *dst++ = c1;
2000 break;
2001 }
2002 /* fall down to treat '\r' as '\n' ... */
2003
2004 case EMACS_linefeed_code:
2005 if (coding->eol_type == CODING_EOL_LF
2006 || coding->eol_type == CODING_EOL_UNDECIDED)
2007 *dst++ = '\n';
2008 else if (coding->eol_type == CODING_EOL_CRLF)
2009 *dst++ = '\r', *dst++ = '\n';
2010 else
2011 *dst++ = '\r';
2012 break;
2013
2014 case EMACS_leading_code_2:
2015 ONE_MORE_BYTE (c2);
2016 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2017 break;
2018
2019 case EMACS_leading_code_3:
2020 TWO_MORE_BYTES (c2, c3);
2021 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2022 break;
2023
2024 case EMACS_leading_code_4:
2025 THREE_MORE_BYTES (c2, c3, c4);
2026 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2027 break;
2028
2029 case EMACS_leading_code_composition:
2030 coding->composing = 1;
2031 break;
2032
2033 default: /* i.e. case EMACS_invalid_code: */
2034 *dst++ = c1;
2035 }
2036 continue;
2037
2038 label_end_of_loop:
2039 coding->carryover_size = src_end - src_base;
2040 bcopy (src_base, coding->carryover, coding->carryover_size);
2041 src = src_end;
2042 break;
2043 }
2044
2045 *consumed = src - source;
2046 return dst - destination;
2047 }
2048
2049 \f
2050 /*** 5. End-of-line handlers ***/
2051
2052 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2053 This function is called only when `coding->eol_type' is
2054 CODING_EOL_CRLF or CODING_EOL_CR. */
2055
2056 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2057 struct coding_system *coding;
2058 unsigned char *source, *destination;
2059 int src_bytes, dst_bytes;
2060 int *consumed;
2061 {
2062 unsigned char *src = source;
2063 unsigned char *src_end = source + src_bytes;
2064 unsigned char *dst = destination;
2065 unsigned char *dst_end = destination + dst_bytes;
2066 int produced;
2067
2068 switch (coding->eol_type)
2069 {
2070 case CODING_EOL_CRLF:
2071 {
2072 /* Since the maximum bytes produced by each loop is 2, we
2073 subtract 1 from DST_END to assure overflow checking is
2074 necessary only at the head of loop. */
2075 unsigned char *adjusted_dst_end = dst_end - 1;
2076
2077 while (src < src_end && dst < adjusted_dst_end)
2078 {
2079 unsigned char *src_base = src;
2080 unsigned char c = *src++;
2081 if (c == '\r')
2082 {
2083 ONE_MORE_BYTE (c);
2084 if (c != '\n')
2085 *dst++ = '\r';
2086 *dst++ = c;
2087 }
2088 else
2089 *dst++ = c;
2090 continue;
2091
2092 label_end_of_loop:
2093 coding->carryover_size = src - src_base;
2094 bcopy (src_base, coding->carryover, coding->carryover_size);
2095 src = src_base;
2096 break;
2097 }
2098 *consumed = src - source;
2099 produced = dst - destination;
2100 break;
2101 }
2102
2103 case CODING_EOL_CR:
2104 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2105 bcopy (source, destination, produced);
2106 dst_end = destination + produced;
2107 while (dst < dst_end)
2108 if (*dst++ == '\r') dst[-1] = '\n';
2109 *consumed = produced;
2110 break;
2111
2112 default: /* i.e. case: CODING_EOL_LF */
2113 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2114 bcopy (source, destination, produced);
2115 *consumed = produced;
2116 break;
2117 }
2118
2119 return produced;
2120 }
2121
2122 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2123 format of end-of-line according to `coding->eol_type'. If
2124 `coding->selective' is 1, code '\r' in source text also means
2125 end-of-line. */
2126
2127 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2128 struct coding_system *coding;
2129 unsigned char *source, *destination;
2130 int src_bytes, dst_bytes;
2131 int *consumed;
2132 {
2133 unsigned char *src = source;
2134 unsigned char *dst = destination;
2135 int produced;
2136
2137 if (src_bytes <= 0)
2138 return 0;
2139
2140 switch (coding->eol_type)
2141 {
2142 case CODING_EOL_LF:
2143 case CODING_EOL_UNDECIDED:
2144 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2145 bcopy (source, destination, produced);
2146 if (coding->selective)
2147 {
2148 int i = produced;
2149 while (i--)
2150 if (*dst++ == '\r') dst[-1] = '\n';
2151 }
2152 *consumed = produced;
2153
2154 case CODING_EOL_CRLF:
2155 {
2156 unsigned char c;
2157 unsigned char *src_end = source + src_bytes;
2158 unsigned char *dst_end = destination + dst_bytes;
2159 /* Since the maximum bytes produced by each loop is 2, we
2160 subtract 1 from DST_END to assure overflow checking is
2161 necessary only at the head of loop. */
2162 unsigned char *adjusted_dst_end = dst_end - 1;
2163
2164 while (src < src_end && dst < adjusted_dst_end)
2165 {
2166 c = *src++;
2167 if (c == '\n' || (c == '\r' && coding->selective))
2168 *dst++ = '\r', *dst++ = '\n';
2169 else
2170 *dst++ = c;
2171 }
2172 produced = dst - destination;
2173 *consumed = src - source;
2174 break;
2175 }
2176
2177 default: /* i.e. case CODING_EOL_CR: */
2178 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2179 bcopy (source, destination, produced);
2180 {
2181 int i = produced;
2182 while (i--)
2183 if (*dst++ == '\n') dst[-1] = '\r';
2184 }
2185 *consumed = produced;
2186 }
2187
2188 return produced;
2189 }
2190
2191 \f
2192 /*** 6. C library functions ***/
2193
2194 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2195 has a property `coding-system'. The value of this property is a
2196 vector of length 5 (called as coding-vector). Among elements of
2197 this vector, the first (element[0]) and the fifth (element[4])
2198 carry important information for decoding/encoding. Before
2199 decoding/encoding, this information should be set in fields of a
2200 structure of type `coding_system'.
2201
2202 A value of property `coding-system' can be a symbol of another
2203 subsidiary coding-system. In that case, Emacs gets coding-vector
2204 from that symbol.
2205
2206 `element[0]' contains information to be set in `coding->type'. The
2207 value and its meaning is as follows:
2208
2209 0 -- coding_type_emacs_mule
2210 1 -- coding_type_sjis
2211 2 -- coding_type_iso2022
2212 3 -- coding_type_big5
2213 4 -- coding_type_ccl encoder/decoder written in CCL
2214 nil -- coding_type_no_conversion
2215 t -- coding_type_undecided (automatic conversion on decoding,
2216 no-conversion on encoding)
2217
2218 `element[4]' contains information to be set in `coding->flags' and
2219 `coding->spec'. The meaning varies by `coding->type'.
2220
2221 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2222 of length 32 (of which the first 13 sub-elements are used now).
2223 Meanings of these sub-elements are:
2224
2225 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2226 If the value is an integer of valid charset, the charset is
2227 assumed to be designated to graphic register N initially.
2228
2229 If the value is minus, it is a minus value of charset which
2230 reserves graphic register N, which means that the charset is
2231 not designated initially but should be designated to graphic
2232 register N just before encoding a character in that charset.
2233
2234 If the value is nil, graphic register N is never used on
2235 encoding.
2236
2237 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2238 Each value takes t or nil. See the section ISO2022 of
2239 `coding.h' for more information.
2240
2241 If `coding->type' is `coding_type_big5', element[4] is t to denote
2242 BIG5-ETen or nil to denote BIG5-HKU.
2243
2244 If `coding->type' takes the other value, element[4] is ignored.
2245
2246 Emacs Lisp's coding system also carries information about format of
2247 end-of-line in a value of property `eol-type'. If the value is
2248 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2249 means CODING_EOL_CR. If it is not integer, it should be a vector
2250 of subsidiary coding systems of which property `eol-type' has one
2251 of above values.
2252
2253 */
2254
2255 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2256 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2257 is setup so that no conversion is necessary and return -1, else
2258 return 0. */
2259
2260 int
2261 setup_coding_system (coding_system, coding)
2262 Lisp_Object coding_system;
2263 struct coding_system *coding;
2264 {
2265 Lisp_Object type, eol_type;
2266
2267 /* At first, set several fields to default values. */
2268 coding->require_flushing = 0;
2269 coding->last_block = 0;
2270 coding->selective = 0;
2271 coding->composing = 0;
2272 coding->direction = 0;
2273 coding->carryover_size = 0;
2274 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2275 coding->character_unification_table_for_decode = Qnil;
2276 coding->character_unification_table_for_encode = Qnil;
2277
2278 Vlast_coding_system_used = coding->symbol = coding_system;
2279 eol_type = Qnil;
2280 /* Get value of property `coding-system' until we get a vector.
2281 While doing that, also get values of properties
2282 `post-read-conversion', `pre-write-conversion',
2283 `character-unification-table-for-decode',
2284 `character-unification-table-for-encode' and `eol-type'. */
2285 while (!NILP (coding_system) && SYMBOLP (coding_system))
2286 {
2287 if (NILP (coding->post_read_conversion))
2288 coding->post_read_conversion = Fget (coding_system,
2289 Qpost_read_conversion);
2290 if (NILP (coding->pre_write_conversion))
2291 coding->pre_write_conversion = Fget (coding_system,
2292 Qpre_write_conversion);
2293 if (!inhibit_eol_conversion && NILP (eol_type))
2294 eol_type = Fget (coding_system, Qeol_type);
2295
2296 if (NILP (coding->character_unification_table_for_decode))
2297 coding->character_unification_table_for_decode
2298 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2299
2300 if (NILP (coding->character_unification_table_for_encode))
2301 coding->character_unification_table_for_encode
2302 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2303
2304 coding_system = Fget (coding_system, Qcoding_system);
2305 }
2306
2307 while (!NILP (coding->character_unification_table_for_decode)
2308 && SYMBOLP (coding->character_unification_table_for_decode))
2309 coding->character_unification_table_for_decode
2310 = Fget (coding->character_unification_table_for_decode,
2311 Qcharacter_unification_table_for_decode);
2312 if (!NILP (coding->character_unification_table_for_decode)
2313 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2314 coding->character_unification_table_for_decode = Qnil;
2315
2316 while (!NILP (coding->character_unification_table_for_encode)
2317 && SYMBOLP (coding->character_unification_table_for_encode))
2318 coding->character_unification_table_for_encode
2319 = Fget (coding->character_unification_table_for_encode,
2320 Qcharacter_unification_table_for_encode);
2321 if (!NILP (coding->character_unification_table_for_encode)
2322 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2323 coding->character_unification_table_for_encode = Qnil;
2324
2325 if (!VECTORP (coding_system)
2326 || XVECTOR (coding_system)->size != 5)
2327 goto label_invalid_coding_system;
2328
2329 if (VECTORP (eol_type))
2330 coding->eol_type = CODING_EOL_UNDECIDED;
2331 else if (XFASTINT (eol_type) == 1)
2332 coding->eol_type = CODING_EOL_CRLF;
2333 else if (XFASTINT (eol_type) == 2)
2334 coding->eol_type = CODING_EOL_CR;
2335 else
2336 coding->eol_type = CODING_EOL_LF;
2337
2338 type = XVECTOR (coding_system)->contents[0];
2339 switch (XFASTINT (type))
2340 {
2341 case 0:
2342 coding->type = coding_type_emacs_mule;
2343 break;
2344
2345 case 1:
2346 coding->type = coding_type_sjis;
2347 break;
2348
2349 case 2:
2350 coding->type = coding_type_iso2022;
2351 {
2352 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2353 Lisp_Object *flags;
2354 int i, charset, default_reg_bits = 0;
2355
2356 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2357 goto label_invalid_coding_system;
2358
2359 flags = XVECTOR (val)->contents;
2360 coding->flags
2361 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2362 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2363 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2364 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2365 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2366 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2367 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2368 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2369 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2370 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2371 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2372 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2373 );
2374
2375 /* Invoke graphic register 0 to plane 0. */
2376 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2377 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2378 CODING_SPEC_ISO_INVOCATION (coding, 1)
2379 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2380 /* Not single shifting at first. */
2381 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2382 /* Beginning of buffer should also be regarded as bol. */
2383 CODING_SPEC_ISO_BOL (coding) = 1;
2384
2385 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2386 FLAGS[REG] can be one of below:
2387 integer CHARSET: CHARSET occupies register I,
2388 t: designate nothing to REG initially, but can be used
2389 by any charsets,
2390 list of integer, nil, or t: designate the first
2391 element (if integer) to REG initially, the remaining
2392 elements (if integer) is designated to REG on request,
2393 if an element is t, REG can be used by any charset,
2394 nil: REG is never used. */
2395 for (charset = 0; charset <= MAX_CHARSET; charset++)
2396 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2397 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2398 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2399 for (i = 0; i < 4; i++)
2400 {
2401 if (INTEGERP (flags[i])
2402 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2403 || (charset = get_charset_id (flags[i])) >= 0)
2404 {
2405 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2406 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2407 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2408 }
2409 else if (EQ (flags[i], Qt))
2410 {
2411 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2412 default_reg_bits |= 1 << i;
2413 }
2414 else if (CONSP (flags[i]))
2415 {
2416 Lisp_Object tail = flags[i];
2417
2418 if (INTEGERP (XCONS (tail)->car)
2419 && (charset = XINT (XCONS (tail)->car),
2420 CHARSET_VALID_P (charset))
2421 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2422 {
2423 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2424 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2425 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2426 }
2427 else
2428 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2429 tail = XCONS (tail)->cdr;
2430 while (CONSP (tail))
2431 {
2432 if (INTEGERP (XCONS (tail)->car)
2433 && (charset = XINT (XCONS (tail)->car),
2434 CHARSET_VALID_P (charset))
2435 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2436 {
2437 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2438 = i;
2439 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2440 = 1;
2441 }
2442 else if (EQ (XCONS (tail)->car, Qt))
2443 default_reg_bits |= 1 << i;
2444 tail = XCONS (tail)->cdr;
2445 }
2446 }
2447 else
2448 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2449
2450 CODING_SPEC_ISO_DESIGNATION (coding, i)
2451 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2452 }
2453
2454 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2455 {
2456 /* REG 1 can be used only by locking shift in 7-bit env. */
2457 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2458 default_reg_bits &= ~2;
2459 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2460 /* Without any shifting, only REG 0 and 1 can be used. */
2461 default_reg_bits &= 3;
2462 }
2463
2464 for (charset = 0; charset <= MAX_CHARSET; charset++)
2465 if (CHARSET_VALID_P (charset)
2466 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2467 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2468 {
2469 /* We have not yet decided where to designate CHARSET. */
2470 int reg_bits = default_reg_bits;
2471
2472 if (CHARSET_CHARS (charset) == 96)
2473 /* A charset of CHARS96 can't be designated to REG 0. */
2474 reg_bits &= ~1;
2475
2476 if (reg_bits)
2477 /* There exist some default graphic register. */
2478 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2479 = (reg_bits & 1
2480 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2481 else
2482 /* We anyway have to designate CHARSET to somewhere. */
2483 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2484 = (CHARSET_CHARS (charset) == 94
2485 ? 0
2486 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2487 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2488 ? 1
2489 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2490 ? 2 : 0)));
2491 }
2492 }
2493 coding->require_flushing = 1;
2494 break;
2495
2496 case 3:
2497 coding->type = coding_type_big5;
2498 coding->flags
2499 = (NILP (XVECTOR (coding_system)->contents[4])
2500 ? CODING_FLAG_BIG5_HKU
2501 : CODING_FLAG_BIG5_ETEN);
2502 break;
2503
2504 case 4:
2505 coding->type = coding_type_ccl;
2506 {
2507 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2508 if (CONSP (val)
2509 && VECTORP (XCONS (val)->car)
2510 && VECTORP (XCONS (val)->cdr))
2511 {
2512 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2513 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2514 }
2515 else
2516 goto label_invalid_coding_system;
2517 }
2518 coding->require_flushing = 1;
2519 break;
2520
2521 default:
2522 if (EQ (type, Qt))
2523 coding->type = coding_type_undecided;
2524 else
2525 coding->type = coding_type_no_conversion;
2526 break;
2527 }
2528 return 0;
2529
2530 label_invalid_coding_system:
2531 coding->type = coding_type_no_conversion;
2532 coding->eol_type = CODING_EOL_LF;
2533 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2534 = Qnil;
2535 return -1;
2536 }
2537
2538 /* Emacs has a mechanism to automatically detect a coding system if it
2539 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2540 it's impossible to distinguish some coding systems accurately
2541 because they use the same range of codes. So, at first, coding
2542 systems are categorized into 7, those are:
2543
2544 o coding-category-emacs-mule
2545
2546 The category for a coding system which has the same code range
2547 as Emacs' internal format. Assigned the coding-system (Lisp
2548 symbol) `emacs-mule' by default.
2549
2550 o coding-category-sjis
2551
2552 The category for a coding system which has the same code range
2553 as SJIS. Assigned the coding-system (Lisp
2554 symbol) `japanese-shift-jis' by default.
2555
2556 o coding-category-iso-7
2557
2558 The category for a coding system which has the same code range
2559 as ISO2022 of 7-bit environment. This doesn't use any locking
2560 shift and single shift functions. Assigned the coding-system
2561 (Lisp symbol) `iso-2022-7bit' by default.
2562
2563 o coding-category-iso-8-1
2564
2565 The category for a coding system which has the same code range
2566 as ISO2022 of 8-bit environment and graphic plane 1 used only
2567 for DIMENSION1 charset. This doesn't use any locking shift
2568 and single shift functions. Assigned the coding-system (Lisp
2569 symbol) `iso-latin-1' by default.
2570
2571 o coding-category-iso-8-2
2572
2573 The category for a coding system which has the same code range
2574 as ISO2022 of 8-bit environment and graphic plane 1 used only
2575 for DIMENSION2 charset. This doesn't use any locking shift
2576 and single shift functions. Assigned the coding-system (Lisp
2577 symbol) `japanese-iso-8bit' by default.
2578
2579 o coding-category-iso-7-else
2580
2581 The category for a coding system which has the same code range
2582 as ISO2022 of 7-bit environemnt but uses locking shift or
2583 single shift functions. Assigned the coding-system (Lisp
2584 symbol) `iso-2022-7bit-lock' by default.
2585
2586 o coding-category-iso-8-else
2587
2588 The category for a coding system which has the same code range
2589 as ISO2022 of 8-bit environemnt but uses locking shift or
2590 single shift functions. Assigned the coding-system (Lisp
2591 symbol) `iso-2022-8bit-ss2' by default.
2592
2593 o coding-category-big5
2594
2595 The category for a coding system which has the same code range
2596 as BIG5. Assigned the coding-system (Lisp symbol)
2597 `cn-big5' by default.
2598
2599 o coding-category-binary
2600
2601 The category for a coding system not categorized in any of the
2602 above. Assigned the coding-system (Lisp symbol)
2603 `no-conversion' by default.
2604
2605 Each of them is a Lisp symbol and the value is an actual
2606 `coding-system's (this is also a Lisp symbol) assigned by a user.
2607 What Emacs does actually is to detect a category of coding system.
2608 Then, it uses a `coding-system' assigned to it. If Emacs can't
2609 decide only one possible category, it selects a category of the
2610 highest priority. Priorities of categories are also specified by a
2611 user in a Lisp variable `coding-category-list'.
2612
2613 */
2614
2615 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2616 If it detects possible coding systems, return an integer in which
2617 appropriate flag bits are set. Flag bits are defined by macros
2618 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2619
2620 int
2621 detect_coding_mask (src, src_bytes)
2622 unsigned char *src;
2623 int src_bytes;
2624 {
2625 register unsigned char c;
2626 unsigned char *src_end = src + src_bytes;
2627 int mask;
2628
2629 /* At first, skip all ASCII characters and control characters except
2630 for three ISO2022 specific control characters. */
2631 label_loop_detect_coding:
2632 while (src < src_end)
2633 {
2634 c = *src;
2635 if (c >= 0x80
2636 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2637 break;
2638 src++;
2639 }
2640
2641 if (src >= src_end)
2642 /* We found nothing other than ASCII. There's nothing to do. */
2643 return CODING_CATEGORY_MASK_ANY;
2644
2645 /* The text seems to be encoded in some multilingual coding system.
2646 Now, try to find in which coding system the text is encoded. */
2647 if (c < 0x80)
2648 {
2649 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2650 /* C is an ISO2022 specific control code of C0. */
2651 mask = detect_coding_iso2022 (src, src_end);
2652 src++;
2653 if (mask == CODING_CATEGORY_MASK_ANY)
2654 /* No valid ISO2022 code follows C. Try again. */
2655 goto label_loop_detect_coding;
2656 }
2657 else if (c < 0xA0)
2658 {
2659 /* If C is a special Microsoft code,
2660 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2661 or is an ISO2022 control-sequence-introducer (CSI),
2662 we should also consider the possibility of someof ISO2022 codings. */
2663 if ((VECTORP (Vmicrosoft_code_table)
2664 && !NILP (XVECTOR (Vmicrosoft_code_table)->contents[c]))
2665 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2666 || (c == ISO_CODE_CSI
2667 && (src < src_end
2668 && (*src == ']'
2669 || (src + 1 < src_end
2670 && src[1] == ']'
2671 && (*src == '0' || *src == '1' || *src == '2'))))))
2672 mask = (detect_coding_iso2022 (src, src_end)
2673 | detect_coding_sjis (src, src_end)
2674 | detect_coding_emacs_mule (src, src_end)
2675 | CODING_CATEGORY_MASK_BINARY);
2676
2677 else
2678 /* C is the first byte of SJIS character code, or a
2679 leading-code of Emacs. */
2680 mask = (detect_coding_sjis (src, src_end)
2681 | detect_coding_emacs_mule (src, src_end)
2682 | CODING_CATEGORY_MASK_BINARY);
2683 }
2684 else
2685 /* C is a character of ISO2022 in graphic plane right,
2686 or a SJIS's 1-byte character code (i.e. JISX0201),
2687 or the first byte of BIG5's 2-byte code. */
2688 mask = (detect_coding_iso2022 (src, src_end)
2689 | detect_coding_sjis (src, src_end)
2690 | detect_coding_big5 (src, src_end)
2691 | CODING_CATEGORY_MASK_BINARY);
2692
2693 return mask;
2694 }
2695
2696 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2697 The information of the detected coding system is set in CODING. */
2698
2699 void
2700 detect_coding (coding, src, src_bytes)
2701 struct coding_system *coding;
2702 unsigned char *src;
2703 int src_bytes;
2704 {
2705 int mask = detect_coding_mask (src, src_bytes);
2706 int idx;
2707
2708 if (mask == CODING_CATEGORY_MASK_ANY)
2709 /* We found nothing other than ASCII. There's nothing to do. */
2710 return;
2711
2712 if (!mask)
2713 /* The source text seems to be encoded in unknown coding system.
2714 Emacs regards the category of such a kind of coding system as
2715 `coding-category-binary'. We assume that a user has assigned
2716 an appropriate coding system for a `coding-category-binary'. */
2717 idx = CODING_CATEGORY_IDX_BINARY;
2718 else
2719 {
2720 /* We found some plausible coding systems. Let's use a coding
2721 system of the highest priority. */
2722 Lisp_Object val = Vcoding_category_list;
2723
2724 if (CONSP (val))
2725 while (!NILP (val))
2726 {
2727 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2728 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2729 break;
2730 val = XCONS (val)->cdr;
2731 }
2732 else
2733 val = Qnil;
2734
2735 if (NILP (val))
2736 {
2737 /* For unknown reason, `Vcoding_category_list' contains none
2738 of found categories. Let's use any of them. */
2739 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2740 if (mask & (1 << idx))
2741 break;
2742 }
2743 }
2744 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2745 }
2746
2747 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2748 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2749 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2750
2751 #define MAX_EOL_CHECK_COUNT 3
2752
2753 int
2754 detect_eol_type (src, src_bytes)
2755 unsigned char *src;
2756 int src_bytes;
2757 {
2758 unsigned char *src_end = src + src_bytes;
2759 unsigned char c;
2760 int total = 0; /* How many end-of-lines are found so far. */
2761 int eol_type = CODING_EOL_UNDECIDED;
2762 int this_eol_type;
2763
2764 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2765 {
2766 c = *src++;
2767 if (c == '\n' || c == '\r')
2768 {
2769 total++;
2770 if (c == '\n')
2771 this_eol_type = CODING_EOL_LF;
2772 else if (src >= src_end || *src != '\n')
2773 this_eol_type = CODING_EOL_CR;
2774 else
2775 this_eol_type = CODING_EOL_CRLF, src++;
2776
2777 if (eol_type == CODING_EOL_UNDECIDED)
2778 /* This is the first end-of-line. */
2779 eol_type = this_eol_type;
2780 else if (eol_type != this_eol_type)
2781 /* The found type is different from what found before.
2782 We had better not decode end-of-line. */
2783 return CODING_EOL_LF;
2784 }
2785 }
2786
2787 return eol_type;
2788 }
2789
2790 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2791 is encoded. If it detects an appropriate format of end-of-line, it
2792 sets the information in *CODING. */
2793
2794 void
2795 detect_eol (coding, src, src_bytes)
2796 struct coding_system *coding;
2797 unsigned char *src;
2798 int src_bytes;
2799 {
2800 Lisp_Object val;
2801 int eol_type = detect_eol_type (src, src_bytes);
2802
2803 if (eol_type == CODING_EOL_UNDECIDED)
2804 /* We found no end-of-line in the source text. */
2805 return;
2806
2807 val = Fget (coding->symbol, Qeol_type);
2808 if (VECTORP (val) && XVECTOR (val)->size == 3)
2809 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2810 }
2811
2812 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2813 decoding, it may detect coding system and format of end-of-line if
2814 those are not yet decided. */
2815
2816 int
2817 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2818 struct coding_system *coding;
2819 unsigned char *source, *destination;
2820 int src_bytes, dst_bytes;
2821 int *consumed;
2822 {
2823 int produced;
2824
2825 if (src_bytes <= 0)
2826 {
2827 *consumed = 0;
2828 return 0;
2829 }
2830
2831 if (coding->type == coding_type_undecided)
2832 detect_coding (coding, source, src_bytes);
2833
2834 if (coding->eol_type == CODING_EOL_UNDECIDED)
2835 detect_eol (coding, source, src_bytes);
2836
2837 coding->carryover_size = 0;
2838 switch (coding->type)
2839 {
2840 case coding_type_no_conversion:
2841 label_no_conversion:
2842 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2843 bcopy (source, destination, produced);
2844 *consumed = produced;
2845 break;
2846
2847 case coding_type_emacs_mule:
2848 case coding_type_undecided:
2849 if (coding->eol_type == CODING_EOL_LF
2850 || coding->eol_type == CODING_EOL_UNDECIDED)
2851 goto label_no_conversion;
2852 produced = decode_eol (coding, source, destination,
2853 src_bytes, dst_bytes, consumed);
2854 break;
2855
2856 case coding_type_sjis:
2857 produced = decode_coding_sjis_big5 (coding, source, destination,
2858 src_bytes, dst_bytes, consumed,
2859 1);
2860 break;
2861
2862 case coding_type_iso2022:
2863 produced = decode_coding_iso2022 (coding, source, destination,
2864 src_bytes, dst_bytes, consumed);
2865 break;
2866
2867 case coding_type_big5:
2868 produced = decode_coding_sjis_big5 (coding, source, destination,
2869 src_bytes, dst_bytes, consumed,
2870 0);
2871 break;
2872
2873 case coding_type_ccl:
2874 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2875 src_bytes, dst_bytes, consumed);
2876 break;
2877 }
2878
2879 return produced;
2880 }
2881
2882 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2883
2884 int
2885 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2886 struct coding_system *coding;
2887 unsigned char *source, *destination;
2888 int src_bytes, dst_bytes;
2889 int *consumed;
2890 {
2891 int produced;
2892
2893 switch (coding->type)
2894 {
2895 case coding_type_no_conversion:
2896 label_no_conversion:
2897 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2898 if (produced > 0)
2899 {
2900 bcopy (source, destination, produced);
2901 if (coding->selective)
2902 {
2903 unsigned char *p = destination, *pend = destination + produced;
2904 while (p < pend)
2905 if (*p++ == '\015') p[-1] = '\n';
2906 }
2907 }
2908 *consumed = produced;
2909 break;
2910
2911 case coding_type_emacs_mule:
2912 case coding_type_undecided:
2913 if (coding->eol_type == CODING_EOL_LF
2914 || coding->eol_type == CODING_EOL_UNDECIDED)
2915 goto label_no_conversion;
2916 produced = encode_eol (coding, source, destination,
2917 src_bytes, dst_bytes, consumed);
2918 break;
2919
2920 case coding_type_sjis:
2921 produced = encode_coding_sjis_big5 (coding, source, destination,
2922 src_bytes, dst_bytes, consumed,
2923 1);
2924 break;
2925
2926 case coding_type_iso2022:
2927 produced = encode_coding_iso2022 (coding, source, destination,
2928 src_bytes, dst_bytes, consumed);
2929 break;
2930
2931 case coding_type_big5:
2932 produced = encode_coding_sjis_big5 (coding, source, destination,
2933 src_bytes, dst_bytes, consumed,
2934 0);
2935 break;
2936
2937 case coding_type_ccl:
2938 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2939 src_bytes, dst_bytes, consumed);
2940 break;
2941 }
2942
2943 return produced;
2944 }
2945
2946 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2947
2948 /* Return maximum size (bytes) of a buffer enough for decoding
2949 SRC_BYTES of text encoded in CODING. */
2950
2951 int
2952 decoding_buffer_size (coding, src_bytes)
2953 struct coding_system *coding;
2954 int src_bytes;
2955 {
2956 int magnification;
2957
2958 if (coding->type == coding_type_iso2022)
2959 magnification = 3;
2960 else if (coding->type == coding_type_ccl)
2961 magnification = coding->spec.ccl.decoder.buf_magnification;
2962 else
2963 magnification = 2;
2964
2965 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2966 }
2967
2968 /* Return maximum size (bytes) of a buffer enough for encoding
2969 SRC_BYTES of text to CODING. */
2970
2971 int
2972 encoding_buffer_size (coding, src_bytes)
2973 struct coding_system *coding;
2974 int src_bytes;
2975 {
2976 int magnification;
2977
2978 if (coding->type == coding_type_ccl)
2979 magnification = coding->spec.ccl.encoder.buf_magnification;
2980 else
2981 magnification = 3;
2982
2983 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2984 }
2985
2986 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2987 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2988 #endif
2989
2990 char *conversion_buffer;
2991 int conversion_buffer_size;
2992
2993 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2994 or decoding. Sufficient memory is allocated automatically. If we
2995 run out of memory, return NULL. */
2996
2997 char *
2998 get_conversion_buffer (size)
2999 int size;
3000 {
3001 if (size > conversion_buffer_size)
3002 {
3003 char *buf;
3004 int real_size = conversion_buffer_size * 2;
3005
3006 while (real_size < size) real_size *= 2;
3007 buf = (char *) xmalloc (real_size);
3008 xfree (conversion_buffer);
3009 conversion_buffer = buf;
3010 conversion_buffer_size = real_size;
3011 }
3012 return conversion_buffer;
3013 }
3014
3015 \f
3016 #ifdef emacs
3017 /*** 7. Emacs Lisp library functions ***/
3018
3019 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3020 1, 1, 0,
3021 "Return coding-spec of CODING-SYSTEM.\n\
3022 If CODING-SYSTEM is not a valid coding-system, return nil.")
3023 (obj)
3024 Lisp_Object obj;
3025 {
3026 while (SYMBOLP (obj) && !NILP (obj))
3027 obj = Fget (obj, Qcoding_system);
3028 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3029 ? Qnil : obj);
3030 }
3031
3032 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3033 "Return t if OBJECT is nil or a coding-system.\n\
3034 See document of make-coding-system for coding-system object.")
3035 (obj)
3036 Lisp_Object obj;
3037 {
3038 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3039 }
3040
3041 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3042 Sread_non_nil_coding_system, 1, 1, 0,
3043 "Read a coding system from the minibuffer, prompting with string PROMPT.")
3044 (prompt)
3045 Lisp_Object prompt;
3046 {
3047 Lisp_Object val;
3048 do
3049 {
3050 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3051 Qt, Qnil, Qnil, Qnil);
3052 }
3053 while (XSTRING (val)->size == 0);
3054 return (Fintern (val, Qnil));
3055 }
3056
3057 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3058 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3059 (prompt)
3060 Lisp_Object prompt;
3061 {
3062 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3063 Qt, Qnil, Qnil, Qnil);
3064 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3065 }
3066
3067 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3068 1, 1, 0,
3069 "Check validity of CODING-SYSTEM.\n\
3070 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3071 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3072 The value of property should be a vector of length 5.")
3073 (coding_system)
3074 Lisp_Object coding_system;
3075 {
3076 CHECK_SYMBOL (coding_system, 0);
3077 if (!NILP (Fcoding_system_p (coding_system)))
3078 return coding_system;
3079 while (1)
3080 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3081 }
3082
3083 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3084 2, 2, 0,
3085 "Detect coding system of the text in the region between START and END.\n\
3086 Return a list of possible coding systems ordered by priority.\n\
3087 If only ASCII characters are found, it returns `undecided'\n\
3088 or its subsidiary coding system according to a detected end-of-line format.")
3089 (b, e)
3090 Lisp_Object b, e;
3091 {
3092 int coding_mask, eol_type;
3093 Lisp_Object val;
3094 int beg, end;
3095
3096 validate_region (&b, &e);
3097 beg = XINT (b), end = XINT (e);
3098 if (beg < GPT && end >= GPT) move_gap (end);
3099
3100 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3101 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3102
3103 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3104 {
3105 val = intern ("undecided");
3106 if (eol_type != CODING_EOL_UNDECIDED)
3107 {
3108 Lisp_Object val2 = Fget (val, Qeol_type);
3109 if (VECTORP (val2))
3110 val = XVECTOR (val2)->contents[eol_type];
3111 }
3112 }
3113 else
3114 {
3115 Lisp_Object val2;
3116
3117 /* At first, gather possible coding-systems in VAL in a reverse
3118 order. */
3119 val = Qnil;
3120 for (val2 = Vcoding_category_list;
3121 !NILP (val2);
3122 val2 = XCONS (val2)->cdr)
3123 {
3124 int idx
3125 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3126 if (coding_mask & (1 << idx))
3127 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3128 }
3129
3130 /* Then, change the order of the list, while getting subsidiary
3131 coding-systems. */
3132 val2 = val;
3133 val = Qnil;
3134 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3135 {
3136 if (eol_type == CODING_EOL_UNDECIDED)
3137 val = Fcons (XCONS (val2)->car, val);
3138 else
3139 {
3140 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3141 if (VECTORP (val3))
3142 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3143 else
3144 val = Fcons (XCONS (val2)->car, val);
3145 }
3146 }
3147 }
3148
3149 return val;
3150 }
3151
3152 /* Scan text in the region between *BEGP and *ENDP, skip characters
3153 which we never have to encode to (iff ENCODEP is 1) or decode from
3154 coding system CODING at the head and tail, then set BEGP and ENDP
3155 to the addresses of start and end of the text we actually convert. */
3156
3157 void
3158 shrink_conversion_area (begp, endp, coding, encodep)
3159 unsigned char **begp, **endp;
3160 struct coding_system *coding;
3161 int encodep;
3162 {
3163 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3164
3165 if (coding->eol_type != CODING_EOL_LF
3166 && coding->eol_type != CODING_EOL_UNDECIDED)
3167 /* Since we anyway have to convert end-of-line format, it is not
3168 worth skipping at most 100 bytes or so. */
3169 return;
3170
3171 if (encodep) /* for encoding */
3172 {
3173 switch (coding->type)
3174 {
3175 case coding_type_no_conversion:
3176 case coding_type_emacs_mule:
3177 case coding_type_undecided:
3178 /* We need no conversion. */
3179 *begp = *endp;
3180 return;
3181 case coding_type_ccl:
3182 /* We can't skip any data. */
3183 return;
3184 case coding_type_iso2022:
3185 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3186 {
3187 unsigned char *bol = beg_addr;
3188 while (beg_addr < end_addr && *beg_addr < 0x80)
3189 {
3190 beg_addr++;
3191 if (*(beg_addr - 1) == '\n')
3192 bol = beg_addr;
3193 }
3194 beg_addr = bol;
3195 goto label_skip_tail;
3196 }
3197 /* fall down ... */
3198 default:
3199 /* We can skip all ASCII characters at the head and tail. */
3200 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3201 label_skip_tail:
3202 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3203 break;
3204 }
3205 }
3206 else /* for decoding */
3207 {
3208 switch (coding->type)
3209 {
3210 case coding_type_no_conversion:
3211 /* We need no conversion. */
3212 *begp = *endp;
3213 return;
3214 case coding_type_emacs_mule:
3215 if (coding->eol_type == CODING_EOL_LF)
3216 {
3217 /* We need no conversion. */
3218 *begp = *endp;
3219 return;
3220 }
3221 /* We can skip all but carriage-return. */
3222 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3223 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3224 break;
3225 case coding_type_sjis:
3226 case coding_type_big5:
3227 /* We can skip all ASCII characters at the head. */
3228 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3229 /* We can skip all ASCII characters at the tail except for
3230 the second byte of SJIS or BIG5 code. */
3231 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3232 if (end_addr != *endp)
3233 end_addr++;
3234 break;
3235 case coding_type_ccl:
3236 /* We can't skip any data. */
3237 return;
3238 default: /* i.e. case coding_type_iso2022: */
3239 {
3240 unsigned char c;
3241
3242 /* We can skip all ASCII characters except for a few
3243 control codes at the head. */
3244 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3245 && c != ISO_CODE_CR && c != ISO_CODE_SO
3246 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3247 beg_addr++;
3248 }
3249 break;
3250 }
3251 }
3252 *begp = beg_addr;
3253 *endp = end_addr;
3254 return;
3255 }
3256
3257 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3258 text between B and E. B and E are buffer position. */
3259
3260 Lisp_Object
3261 code_convert_region (b, e, coding, encodep)
3262 Lisp_Object b, e;
3263 struct coding_system *coding;
3264 int encodep;
3265 {
3266 int beg, end, len, consumed, produced;
3267 char *buf;
3268 unsigned char *begp, *endp;
3269 int pos = PT;
3270
3271 validate_region (&b, &e);
3272 beg = XINT (b), end = XINT (e);
3273 if (beg < GPT && end >= GPT)
3274 move_gap (end);
3275
3276 if (encodep && !NILP (coding->pre_write_conversion))
3277 {
3278 /* We must call a pre-conversion function which may put a new
3279 text to be converted in a new buffer. */
3280 struct buffer *old = current_buffer, *new;
3281
3282 TEMP_SET_PT (beg);
3283 call2 (coding->pre_write_conversion, b, e);
3284 if (old != current_buffer)
3285 {
3286 /* Replace the original text by the text just generated. */
3287 len = ZV - BEGV;
3288 new = current_buffer;
3289 set_buffer_internal (old);
3290 del_range (beg, end);
3291 insert_from_buffer (new, 1, len, 0);
3292 end = beg + len;
3293 }
3294 }
3295
3296 /* We may be able to shrink the conversion region. */
3297 begp = POS_ADDR (beg); endp = begp + (end - beg);
3298 shrink_conversion_area (&begp, &endp, coding, encodep);
3299
3300 if (begp == endp)
3301 /* We need no conversion. */
3302 len = end - beg;
3303 else
3304 {
3305 beg += begp - POS_ADDR (beg);
3306 end = beg + (endp - begp);
3307
3308 if (encodep)
3309 len = encoding_buffer_size (coding, end - beg);
3310 else
3311 len = decoding_buffer_size (coding, end - beg);
3312 buf = get_conversion_buffer (len);
3313
3314 coding->last_block = 1;
3315 produced = (encodep
3316 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3317 &consumed)
3318 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3319 &consumed));
3320
3321 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3322
3323 TEMP_SET_PT (beg);
3324 insert (buf, produced);
3325 del_range (PT, PT + end - beg);
3326 if (pos >= end)
3327 pos = PT + (pos - end);
3328 else if (pos > beg)
3329 pos = beg;
3330 TEMP_SET_PT (pos);
3331 }
3332
3333 if (!encodep && !NILP (coding->post_read_conversion))
3334 {
3335 /* We must call a post-conversion function which may alter
3336 the text just converted. */
3337 Lisp_Object insval;
3338
3339 beg = XINT (b);
3340 TEMP_SET_PT (beg);
3341 insval = call1 (coding->post_read_conversion, make_number (len));
3342 CHECK_NUMBER (insval, 0);
3343 len = XINT (insval);
3344 }
3345
3346 return make_number (len);
3347 }
3348
3349 Lisp_Object
3350 code_convert_string (str, coding, encodep, nocopy)
3351 Lisp_Object str, nocopy;
3352 struct coding_system *coding;
3353 int encodep;
3354 {
3355 int len, consumed, produced;
3356 char *buf;
3357 unsigned char *begp, *endp;
3358 int head_skip, tail_skip;
3359 struct gcpro gcpro1;
3360
3361 if (encodep && !NILP (coding->pre_write_conversion)
3362 || !encodep && !NILP (coding->post_read_conversion))
3363 {
3364 /* Since we have to call Lisp functions which assume target text
3365 is in a buffer, after setting a temporary buffer, call
3366 code_convert_region. */
3367 int count = specpdl_ptr - specpdl;
3368 int len = XSTRING (str)->size;
3369 Lisp_Object result;
3370 struct buffer *old = current_buffer;
3371
3372 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3373 temp_output_buffer_setup (" *code-converting-work*");
3374 set_buffer_internal (XBUFFER (Vstandard_output));
3375 insert_from_string (str, 0, len, 0);
3376 code_convert_region (make_number (BEGV), make_number (ZV),
3377 coding, encodep);
3378 result = make_buffer_string (BEGV, ZV, 0);
3379 set_buffer_internal (old);
3380 return unbind_to (count, result);
3381 }
3382
3383 /* We may be able to shrink the conversion region. */
3384 begp = XSTRING (str)->data;
3385 endp = begp + XSTRING (str)->size;
3386 shrink_conversion_area (&begp, &endp, coding, encodep);
3387
3388 if (begp == endp)
3389 /* We need no conversion. */
3390 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3391
3392 head_skip = begp - XSTRING (str)->data;
3393 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3394
3395 GCPRO1 (str);
3396
3397 if (encodep)
3398 len = encoding_buffer_size (coding, endp - begp);
3399 else
3400 len = decoding_buffer_size (coding, endp - begp);
3401 buf = get_conversion_buffer (len + head_skip + tail_skip);
3402
3403 bcopy (XSTRING (str)->data, buf, head_skip);
3404 coding->last_block = 1;
3405 produced = (encodep
3406 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3407 buf + head_skip, endp - begp, len, &consumed)
3408 : decode_coding (coding, XSTRING (str)->data + head_skip,
3409 buf + head_skip, endp - begp, len, &consumed));
3410 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3411 buf + head_skip + produced,
3412 tail_skip);
3413
3414 UNGCPRO;
3415
3416 return make_string (buf, head_skip + produced + tail_skip);
3417 }
3418
3419 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3420 3, 3, "r\nzCoding system: ",
3421 "Decode current region by specified coding system.\n\
3422 When called from a program, takes three arguments:\n\
3423 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3424 Return length of decoded text.")
3425 (b, e, coding_system)
3426 Lisp_Object b, e, coding_system;
3427 {
3428 struct coding_system coding;
3429
3430 CHECK_NUMBER_COERCE_MARKER (b, 0);
3431 CHECK_NUMBER_COERCE_MARKER (e, 1);
3432 CHECK_SYMBOL (coding_system, 2);
3433
3434 if (NILP (coding_system))
3435 return make_number (XFASTINT (e) - XFASTINT (b));
3436 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3437 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3438
3439 return code_convert_region (b, e, &coding, 0);
3440 }
3441
3442 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3443 3, 3, "r\nzCoding system: ",
3444 "Encode current region by specified coding system.\n\
3445 When called from a program, takes three arguments:\n\
3446 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3447 Return length of encoded text.")
3448 (b, e, coding_system)
3449 Lisp_Object b, e, coding_system;
3450 {
3451 struct coding_system coding;
3452
3453 CHECK_NUMBER_COERCE_MARKER (b, 0);
3454 CHECK_NUMBER_COERCE_MARKER (e, 1);
3455 CHECK_SYMBOL (coding_system, 2);
3456
3457 if (NILP (coding_system))
3458 return make_number (XFASTINT (e) - XFASTINT (b));
3459 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3460 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3461
3462 return code_convert_region (b, e, &coding, 1);
3463 }
3464
3465 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3466 2, 3, 0,
3467 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3468 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3469 of decoding.")
3470 (string, coding_system, nocopy)
3471 Lisp_Object string, coding_system, nocopy;
3472 {
3473 struct coding_system coding;
3474
3475 CHECK_STRING (string, 0);
3476 CHECK_SYMBOL (coding_system, 1);
3477
3478 if (NILP (coding_system))
3479 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3480 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3481 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3482
3483 return code_convert_string (string, &coding, 0, nocopy);
3484 }
3485
3486 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3487 2, 3, 0,
3488 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3489 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3490 of encoding.")
3491 (string, coding_system, nocopy)
3492 Lisp_Object string, coding_system, nocopy;
3493 {
3494 struct coding_system coding;
3495
3496 CHECK_STRING (string, 0);
3497 CHECK_SYMBOL (coding_system, 1);
3498
3499 if (NILP (coding_system))
3500 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3501 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3502 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3503
3504 return code_convert_string (string, &coding, 1, nocopy);
3505 }
3506
3507 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3508 "Decode a JISX0208 character of shift-jis encoding.\n\
3509 CODE is the character code in SJIS.\n\
3510 Return the corresponding character.")
3511 (code)
3512 Lisp_Object code;
3513 {
3514 unsigned char c1, c2, s1, s2;
3515 Lisp_Object val;
3516
3517 CHECK_NUMBER (code, 0);
3518 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3519 DECODE_SJIS (s1, s2, c1, c2);
3520 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3521 return val;
3522 }
3523
3524 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3525 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3526 Return the corresponding character code in SJIS.")
3527 (ch)
3528 Lisp_Object ch;
3529 {
3530 int charset, c1, c2, s1, s2;
3531 Lisp_Object val;
3532
3533 CHECK_NUMBER (ch, 0);
3534 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3535 if (charset == charset_jisx0208)
3536 {
3537 ENCODE_SJIS (c1, c2, s1, s2);
3538 XSETFASTINT (val, (s1 << 8) | s2);
3539 }
3540 else
3541 XSETFASTINT (val, 0);
3542 return val;
3543 }
3544
3545 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3546 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3547 CODE is the character code in BIG5.\n\
3548 Return the corresponding character.")
3549 (code)
3550 Lisp_Object code;
3551 {
3552 int charset;
3553 unsigned char b1, b2, c1, c2;
3554 Lisp_Object val;
3555
3556 CHECK_NUMBER (code, 0);
3557 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3558 DECODE_BIG5 (b1, b2, charset, c1, c2);
3559 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3560 return val;
3561 }
3562
3563 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3564 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3565 Return the corresponding character code in Big5.")
3566 (ch)
3567 Lisp_Object ch;
3568 {
3569 int charset, c1, c2, b1, b2;
3570 Lisp_Object val;
3571
3572 CHECK_NUMBER (ch, 0);
3573 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3574 if (charset == charset_big5_1 || charset == charset_big5_2)
3575 {
3576 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3577 XSETFASTINT (val, (b1 << 8) | b2);
3578 }
3579 else
3580 XSETFASTINT (val, 0);
3581 return val;
3582 }
3583
3584 DEFUN ("set-terminal-coding-system-internal",
3585 Fset_terminal_coding_system_internal,
3586 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3587 (coding_system)
3588 Lisp_Object coding_system;
3589 {
3590 CHECK_SYMBOL (coding_system, 0);
3591 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3592 /* We had better not send unexpected characters to terminal. */
3593 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3594
3595 return Qnil;
3596 }
3597
3598 DEFUN ("set-safe-terminal-coding-system-internal",
3599 Fset_safe_terminal_coding_system_internal,
3600 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3601 (coding_system)
3602 Lisp_Object coding_system;
3603 {
3604 CHECK_SYMBOL (coding_system, 0);
3605 setup_coding_system (Fcheck_coding_system (coding_system),
3606 &safe_terminal_coding);
3607 return Qnil;
3608 }
3609
3610 DEFUN ("terminal-coding-system",
3611 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3612 "Return coding-system of your terminal.")
3613 ()
3614 {
3615 return terminal_coding.symbol;
3616 }
3617
3618 DEFUN ("set-keyboard-coding-system-internal",
3619 Fset_keyboard_coding_system_internal,
3620 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3621 (coding_system)
3622 Lisp_Object coding_system;
3623 {
3624 CHECK_SYMBOL (coding_system, 0);
3625 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3626 return Qnil;
3627 }
3628
3629 DEFUN ("keyboard-coding-system",
3630 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3631 "Return coding-system of what is sent from terminal keyboard.")
3632 ()
3633 {
3634 return keyboard_coding.symbol;
3635 }
3636
3637 \f
3638 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3639 Sfind_operation_coding_system, 1, MANY, 0,
3640 "Choose a coding system for an operation based on the target name.\n\
3641 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3642 DECODING-SYSTEM is the coding system to use for decoding\n\
3643 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3644 for encoding (in case OPERATION does encoding).\n\
3645 \n\
3646 The first argument OPERATION specifies an I/O primitive:\n\
3647 For file I/O, `insert-file-contents' or `write-region'.\n\
3648 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3649 For network I/O, `open-network-stream'.\n\
3650 \n\
3651 The remaining arguments should be the same arguments that were passed\n\
3652 to the primitive. Depending on which primitive, one of those arguments\n\
3653 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3654 whichever argument specifies the file name is TARGET.\n\
3655 \n\
3656 TARGET has a meaning which depends on OPERATION:\n\
3657 For file I/O, TARGET is a file name.\n\
3658 For process I/O, TARGET is a process name.\n\
3659 For network I/O, TARGET is a service name or a port number\n\
3660 \n\
3661 This function looks up what specified for TARGET in,\n\
3662 `file-coding-system-alist', `process-coding-system-alist',\n\
3663 or `network-coding-system-alist' depending on OPERATION.\n\
3664 They may specify a coding system, a cons of coding systems,\n\
3665 or a function symbol to call.\n\
3666 In the last case, we call the function with one argument,\n\
3667 which is a list of all the arguments given to this function.")
3668 (nargs, args)
3669 int nargs;
3670 Lisp_Object *args;
3671 {
3672 Lisp_Object operation, target_idx, target, val;
3673 register Lisp_Object chain;
3674
3675 if (nargs < 2)
3676 error ("Too few arguments");
3677 operation = args[0];
3678 if (!SYMBOLP (operation)
3679 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3680 error ("Invalid first arguement");
3681 if (nargs < 1 + XINT (target_idx))
3682 error ("Too few arguments for operation: %s",
3683 XSYMBOL (operation)->name->data);
3684 target = args[XINT (target_idx) + 1];
3685 if (!(STRINGP (target)
3686 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3687 error ("Invalid %dth argument", XINT (target_idx) + 1);
3688
3689 chain = ((EQ (operation, Qinsert_file_contents)
3690 || EQ (operation, Qwrite_region))
3691 ? Vfile_coding_system_alist
3692 : (EQ (operation, Qopen_network_stream)
3693 ? Vnetwork_coding_system_alist
3694 : Vprocess_coding_system_alist));
3695 if (NILP (chain))
3696 return Qnil;
3697
3698 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3699 {
3700 Lisp_Object elt = XCONS (chain)->car;
3701
3702 if (CONSP (elt)
3703 && ((STRINGP (target)
3704 && STRINGP (XCONS (elt)->car)
3705 && fast_string_match (XCONS (elt)->car, target) >= 0)
3706 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3707 {
3708 val = XCONS (elt)->cdr;
3709 if (CONSP (val))
3710 return val;
3711 if (! SYMBOLP (val))
3712 return Qnil;
3713 if (! NILP (Fcoding_system_p (val)))
3714 return Fcons (val, val);
3715 if (!NILP (Ffboundp (val)))
3716 return call1 (val, Flist (nargs, args));
3717 return Qnil;
3718 }
3719 }
3720 return Qnil;
3721 }
3722
3723 #endif /* emacs */
3724
3725 \f
3726 /*** 8. Post-amble ***/
3727
3728 init_coding_once ()
3729 {
3730 int i;
3731
3732 /* Emacs' internal format specific initialize routine. */
3733 for (i = 0; i <= 0x20; i++)
3734 emacs_code_class[i] = EMACS_control_code;
3735 emacs_code_class[0x0A] = EMACS_linefeed_code;
3736 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3737 for (i = 0x21 ; i < 0x7F; i++)
3738 emacs_code_class[i] = EMACS_ascii_code;
3739 emacs_code_class[0x7F] = EMACS_control_code;
3740 emacs_code_class[0x80] = EMACS_leading_code_composition;
3741 for (i = 0x81; i < 0xFF; i++)
3742 emacs_code_class[i] = EMACS_invalid_code;
3743 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3744 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3745 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3746 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3747
3748 /* ISO2022 specific initialize routine. */
3749 for (i = 0; i < 0x20; i++)
3750 iso_code_class[i] = ISO_control_code;
3751 for (i = 0x21; i < 0x7F; i++)
3752 iso_code_class[i] = ISO_graphic_plane_0;
3753 for (i = 0x80; i < 0xA0; i++)
3754 iso_code_class[i] = ISO_control_code;
3755 for (i = 0xA1; i < 0xFF; i++)
3756 iso_code_class[i] = ISO_graphic_plane_1;
3757 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3758 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3759 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3760 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3761 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3762 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3763 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3764 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3765 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3766 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3767
3768 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3769 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3770
3771 setup_coding_system (Qnil, &keyboard_coding);
3772 setup_coding_system (Qnil, &terminal_coding);
3773 setup_coding_system (Qnil, &safe_terminal_coding);
3774
3775 #if defined (MSDOS) || defined (WINDOWSNT)
3776 system_eol_type = CODING_EOL_CRLF;
3777 #else
3778 system_eol_type = CODING_EOL_LF;
3779 #endif
3780 }
3781
3782 #ifdef emacs
3783
3784 syms_of_coding ()
3785 {
3786 Qtarget_idx = intern ("target-idx");
3787 staticpro (&Qtarget_idx);
3788
3789 /* Target FILENAME is the first argument. */
3790 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3791 /* Target FILENAME is the third argument. */
3792 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3793
3794 Qcall_process = intern ("call-process");
3795 staticpro (&Qcall_process);
3796 /* Target PROGRAM is the first argument. */
3797 Fput (Qcall_process, Qtarget_idx, make_number (0));
3798
3799 Qcall_process_region = intern ("call-process-region");
3800 staticpro (&Qcall_process_region);
3801 /* Target PROGRAM is the third argument. */
3802 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3803
3804 Qstart_process = intern ("start-process");
3805 staticpro (&Qstart_process);
3806 /* Target PROGRAM is the third argument. */
3807 Fput (Qstart_process, Qtarget_idx, make_number (2));
3808
3809 Qopen_network_stream = intern ("open-network-stream");
3810 staticpro (&Qopen_network_stream);
3811 /* Target SERVICE is the fourth argument. */
3812 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3813
3814 Qcoding_system = intern ("coding-system");
3815 staticpro (&Qcoding_system);
3816
3817 Qeol_type = intern ("eol-type");
3818 staticpro (&Qeol_type);
3819
3820 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3821 staticpro (&Qbuffer_file_coding_system);
3822
3823 Qpost_read_conversion = intern ("post-read-conversion");
3824 staticpro (&Qpost_read_conversion);
3825
3826 Qpre_write_conversion = intern ("pre-write-conversion");
3827 staticpro (&Qpre_write_conversion);
3828
3829 Qcoding_system_spec = intern ("coding-system-spec");
3830 staticpro (&Qcoding_system_spec);
3831
3832 Qcoding_system_p = intern ("coding-system-p");
3833 staticpro (&Qcoding_system_p);
3834
3835 Qcoding_system_error = intern ("coding-system-error");
3836 staticpro (&Qcoding_system_error);
3837
3838 Fput (Qcoding_system_error, Qerror_conditions,
3839 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3840 Fput (Qcoding_system_error, Qerror_message,
3841 build_string ("Invalid coding system"));
3842
3843 Qcoding_category_index = intern ("coding-category-index");
3844 staticpro (&Qcoding_category_index);
3845
3846 {
3847 int i;
3848 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3849 {
3850 coding_category_table[i] = intern (coding_category_name[i]);
3851 staticpro (&coding_category_table[i]);
3852 Fput (coding_category_table[i], Qcoding_category_index,
3853 make_number (i));
3854 }
3855 }
3856
3857 Qcharacter_unification_table = intern ("character-unification-table");
3858 staticpro (&Qcharacter_unification_table);
3859 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3860 make_number (0));
3861
3862 Qcharacter_unification_table_for_decode
3863 = intern ("character-unification-table-for-decode");
3864 staticpro (&Qcharacter_unification_table_for_decode);
3865
3866 Qcharacter_unification_table_for_encode
3867 = intern ("character-unification-table-for-encode");
3868 staticpro (&Qcharacter_unification_table_for_encode);
3869
3870 Qemacs_mule = intern ("emacs-mule");
3871 staticpro (&Qemacs_mule);
3872
3873 defsubr (&Scoding_system_spec);
3874 defsubr (&Scoding_system_p);
3875 defsubr (&Sread_coding_system);
3876 defsubr (&Sread_non_nil_coding_system);
3877 defsubr (&Scheck_coding_system);
3878 defsubr (&Sdetect_coding_region);
3879 defsubr (&Sdecode_coding_region);
3880 defsubr (&Sencode_coding_region);
3881 defsubr (&Sdecode_coding_string);
3882 defsubr (&Sencode_coding_string);
3883 defsubr (&Sdecode_sjis_char);
3884 defsubr (&Sencode_sjis_char);
3885 defsubr (&Sdecode_big5_char);
3886 defsubr (&Sencode_big5_char);
3887 defsubr (&Sset_terminal_coding_system_internal);
3888 defsubr (&Sset_safe_terminal_coding_system_internal);
3889 defsubr (&Sterminal_coding_system);
3890 defsubr (&Sset_keyboard_coding_system_internal);
3891 defsubr (&Skeyboard_coding_system);
3892 defsubr (&Sfind_operation_coding_system);
3893
3894 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3895 "List of coding-categories (symbols) ordered by priority.");
3896 {
3897 int i;
3898
3899 Vcoding_category_list = Qnil;
3900 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3901 Vcoding_category_list
3902 = Fcons (coding_category_table[i], Vcoding_category_list);
3903 }
3904
3905 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3906 "Specify the coding system for read operations.\n\
3907 It is useful to bind this variable with `let', but do not set it globally.\n\
3908 If the value is a coding system, it is used for decoding on read operation.\n\
3909 If not, an appropriate element is used from one of the coding system alists:\n\
3910 There are three such tables, `file-coding-system-alist',\n\
3911 `process-coding-system-alist', and `network-coding-system-alist'.");
3912 Vcoding_system_for_read = Qnil;
3913
3914 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3915 "Specify the coding system for write operations.\n\
3916 It is useful to bind this variable with `let', but do not set it globally.\n\
3917 If the value is a coding system, it is used for encoding on write operation.\n\
3918 If not, an appropriate element is used from one of the coding system alists:\n\
3919 There are three such tables, `file-coding-system-alist',\n\
3920 `process-coding-system-alist', and `network-coding-system-alist'.");
3921 Vcoding_system_for_write = Qnil;
3922
3923 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3924 "Coding system used in the latest file or process I/O.");
3925 Vlast_coding_system_used = Qnil;
3926
3927 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3928 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3929 inhibit_eol_conversion = 0;
3930
3931 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3932 "Alist to decide a coding system to use for a file I/O operation.\n\
3933 The format is ((PATTERN . VAL) ...),\n\
3934 where PATTERN is a regular expression matching a file name,\n\
3935 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3936 If VAL is a coding system, it is used for both decoding and encoding\n\
3937 the file contents.\n\
3938 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3939 and the cdr part is used for encoding.\n\
3940 If VAL is a function symbol, the function must return a coding system\n\
3941 or a cons of coding systems which are used as above.\n\
3942 \n\
3943 See also the function `find-operation-coding-system'.");
3944 Vfile_coding_system_alist = Qnil;
3945
3946 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3947 "Alist to decide a coding system to use for a process I/O operation.\n\
3948 The format is ((PATTERN . VAL) ...),\n\
3949 where PATTERN is a regular expression matching a program name,\n\
3950 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3951 If VAL is a coding system, it is used for both decoding what received\n\
3952 from the program and encoding what sent to the program.\n\
3953 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3954 and the cdr part is used for encoding.\n\
3955 If VAL is a function symbol, the function must return a coding system\n\
3956 or a cons of coding systems which are used as above.\n\
3957 \n\
3958 See also the function `find-operation-coding-system'.");
3959 Vprocess_coding_system_alist = Qnil;
3960
3961 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3962 "Alist to decide a coding system to use for a network I/O operation.\n\
3963 The format is ((PATTERN . VAL) ...),\n\
3964 where PATTERN is a regular expression matching a network service name\n\
3965 or is a port number to connect to,\n\
3966 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3967 If VAL is a coding system, it is used for both decoding what received\n\
3968 from the network stream and encoding what sent to the network stream.\n\
3969 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3970 and the cdr part is used for encoding.\n\
3971 If VAL is a function symbol, the function must return a coding system\n\
3972 or a cons of coding systems which are used as above.\n\
3973 \n\
3974 See also the function `find-operation-coding-system'.");
3975 Vnetwork_coding_system_alist = Qnil;
3976
3977 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3978 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3979 eol_mnemonic_unix = ':';
3980
3981 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3982 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3983 eol_mnemonic_dos = '\\';
3984
3985 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3986 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3987 eol_mnemonic_mac = '/';
3988
3989 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3990 "Mnemonic character indicating end-of-line format is not yet decided.");
3991 eol_mnemonic_undecided = ':';
3992
3993 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3994 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3995 Venable_character_unification = Qt;
3996
3997 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3998 &Vstandard_character_unification_table_for_decode,
3999 "Table for unifying characters when reading.");
4000 Vstandard_character_unification_table_for_decode = Qnil;
4001
4002 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4003 &Vstandard_character_unification_table_for_encode,
4004 "Table for unifying characters when writing.");
4005 Vstandard_character_unification_table_for_encode = Qnil;
4006
4007 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4008 "Alist of charsets vs revision numbers.\n\
4009 While encoding, if a charset (car part of an element) is found,\n\
4010 designate it with the escape sequence identifing revision (cdr part of the element).");
4011 Vcharset_revision_alist = Qnil;
4012
4013 DEFVAR_LISP ("default-process-coding-system",
4014 &Vdefault_process_coding_system,
4015 "Cons of coding systems used for process I/O by default.\n\
4016 The car part is used for decoding a process output,\n\
4017 the cdr part is used for encoding a text to be sent to a process.");
4018 Vdefault_process_coding_system = Qnil;
4019
4020 DEFVAR_LISP ("special-microsoft-code-table", &Vmicrosoft_code_table,
4021 "Table of special Microsoft codes in the range 128..159 (inclusive).\n\
4022 This is a vector of length 256.\n\
4023 If Nth element is non-nil, the existence of code N in a file\n\
4024 (or output of subprocess) doesn't prevent it to be detected as\n\
4025 a coding system of ISO 2022 variant (e.g. iso-latin-1) on reading a file\n\
4026 or reading output of a subprocess.\n\
4027 Only 128th through 159th elements has a meaning.");
4028 Vmicrosoft_code_table = Fmake_vector (make_number (256), Qnil);
4029 }
4030
4031 #endif /* emacs */