]> code.delx.au - gnu-emacs/blob - src/coding.c
(ENCODE_ISO_CHARACTER_DIMENSION1): Pay attention to
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
69
70 4. Other
71
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See section 6 for more details.
81
82 */
83
84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
85
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
88 whereas DOS's format is two-byte sequence of `carriage-return' and
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
90
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
93 any format of end-of-line. So, Emacs has information of format of
94 end-of-line in each coding-system. See section 6 for more details.
95
96 */
97
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
99
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX are set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105 #if 0
106 int
107 detect_coding_emacs_mule (src, src_end)
108 unsigned char *src, *src_end;
109 {
110 ...
111 }
112 #endif
113
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
115
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format (emacs-mule). The resulting text
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
122 #if 0
123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
128 {
129 ...
130 }
131 #endif
132
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
134
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
141 #if 0
142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
147 {
148 ...
149 }
150 #endif
151
152 /*** COMMONLY USED MACROS ***/
153
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES safely fetch one, two, and three bytes from the
   source text respectively, advancing `src' past them.  If fewer
   bytes than requested remain, nothing is consumed and control jumps
   to `label_end_of_loop'.  The caller must define that label and set
   the variables `src' and `src_end' to appropriate areas in advance.

   The amount of remaining text is tested as `src_end - src' rather
   than as `src + N < src_end', because forming a pointer more than
   one element past the end of the source buffer is undefined
   behavior.  */

#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src < src_end)                          \
      (c1) = *src++;                            \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      (c1) = *src++, (c2) = *src++;             \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)                    \
  do {                                                  \
    if (src_end - src >= 3)                             \
      (c1) = *src++, (c2) = *src++, (c3) = *src++;      \
    else                                                \
      goto label_end_of_loop;                           \
  } while (0)
183
/* DECODE_CHARACTER_ASCII, DECODE_CHARACTER_DIMENSION1, and
   DECODE_CHARACTER_DIMENSION2 each store the multi-byte form of one
   character at `dst' and advance `dst' past it.  Before using them,
   the caller must point `dst' at an appropriate area and `coding' at
   the coding-system of the text currently being decoded.  */

/* Store ASCII character C.  While composing, it is written as the
   two bytes 0xA0 and C | 0x80; otherwise as the single byte C.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)

/* Store one DIMENSION1 character of charset CHARSET whose
   position-code is C: the base leading-code (plus 0x20 while
   composing), then the extended leading-code if it is non-zero, and
   finally C | 0x80.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    *dst++ = (COMPOSING_P (coding->composing)                           \
              ? leading_code + 0x20                                     \
              : leading_code);                                          \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)

/* Store one DIMENSION2 character of charset CHARSET whose
   position-codes are C1 and C2: the DIMENSION1 form built from C1,
   followed by C2 | 0x80.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
224
225 \f
226 /*** 1. Preamble ***/
227
228 #include <stdio.h>
229
230 #ifdef emacs
231
232 #include <config.h>
233 #include "lisp.h"
234 #include "buffer.h"
235 #include "charset.h"
236 #include "ccl.h"
237 #include "coding.h"
238 #include "window.h"
239
240 #else /* not emacs */
241
242 #include "mulelib.h"
243
244 #endif /* not emacs */
245
246 Lisp_Object Qcoding_system, Qeol_type;
247 Lisp_Object Qbuffer_file_coding_system;
248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
249
250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252 Lisp_Object Qstart_process, Qopen_network_stream;
253 Lisp_Object Qtarget_idx;
254
255 /* Mnemonic character of each format of end-of-line. */
256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257 /* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259 int eol_mnemonic_undecided;
260
261 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
263 int system_eol_type;
264
265 #ifdef emacs
266
267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
268
269 /* Coding system emacs-mule is for converting only end-of-line format. */
270 Lisp_Object Qemacs_mule;
271
272 /* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274 /* Coding-system for reading files and receiving data from process. */
275 Lisp_Object Vcoding_system_for_read;
276 /* Coding-system for writing files and sending data to process. */
277 Lisp_Object Vcoding_system_for_write;
278 /* Coding-system actually used in the latest I/O. */
279 Lisp_Object Vlast_coding_system_used;
280
281 /* A vector of length 256 which contains information about special
282 Microsoft codes. */
283 Lisp_Object Vmicrosoft_code_table;
284
285 /* Flag to inhibit code conversion of end-of-line format. */
286 int inhibit_eol_conversion;
287
288 /* Coding system to be used to encode text for terminal display. */
289 struct coding_system terminal_coding;
290
291 /* Coding system to be used to encode text for terminal display when
292 terminal coding system is nil. */
293 struct coding_system safe_terminal_coding;
294
295 /* Coding system of what is sent from terminal keyboard. */
296 struct coding_system keyboard_coding;
297
298 Lisp_Object Vfile_coding_system_alist;
299 Lisp_Object Vprocess_coding_system_alist;
300 Lisp_Object Vnetwork_coding_system_alist;
301
302 #endif /* emacs */
303
304 Lisp_Object Qcoding_category_index;
305
306 /* List of symbols `coding-category-xxx' ordered by priority. */
307 Lisp_Object Vcoding_category_list;
308
309 /* Table of coding-systems currently assigned to each coding-category. */
310 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
311
312 /* Table of names of symbol for each coding-category. */
313 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
314 "coding-category-emacs-mule",
315 "coding-category-sjis",
316 "coding-category-iso-7",
317 "coding-category-iso-8-1",
318 "coding-category-iso-8-2",
319 "coding-category-iso-7-else",
320 "coding-category-iso-8-else",
321 "coding-category-big5",
322 "coding-category-binary"
323 };
324
325 /* Flag to tell if we look up unification table on character code
326 conversion. */
327 Lisp_Object Venable_character_unification;
328 /* Standard unification table to look up on decoding (reading). */
329 Lisp_Object Vstandard_character_unification_table_for_decode;
330 /* Standard unification table to look up on encoding (writing). */
331 Lisp_Object Vstandard_character_unification_table_for_encode;
332
333 Lisp_Object Qcharacter_unification_table;
334 Lisp_Object Qcharacter_unification_table_for_decode;
335 Lisp_Object Qcharacter_unification_table_for_encode;
336
337 /* Alist of charsets vs revision number. */
338 Lisp_Object Vcharset_revision_alist;
339
340 /* Default coding systems used for process I/O. */
341 Lisp_Object Vdefault_process_coding_system;
342
343 \f
344 /*** 2. Emacs internal format (emacs-mule) handlers ***/
345
346 /* Emacs' internal format for encoding multiple character sets is a
347 kind of multi-byte encoding, i.e. characters are encoded by
348 variable-length sequences of one-byte codes. ASCII characters
349 and control characters (e.g. `tab', `newline') are represented by
350 one-byte sequences which are their ASCII codes, in the range 0x00
351 through 0x7F. The other characters are represented by a sequence
352 of `base leading-code', optional `extended leading-code', and one
353 or two `position-code's. The length of the sequence is determined
354 by the base leading-code. Leading-code takes the range 0x80
355 through 0x9F, whereas extended leading-code and position-code take
356 the range 0xA0 through 0xFF. See `charset.h' for more details
357 about leading-code and position-code.
358
359 There's one exception to this rule. Special leading-code
360 `leading-code-composition' denotes that the following several
361 characters should be composed into one character. Leading-codes of
362 components (except for ASCII) are added 0x20. An ASCII character
363 component is represented by a 2-byte sequence of `0xA0' and
364 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
365 details of composite character. Hence, we can summarize the code
366 range as follows:
367
368 --- CODE RANGE of Emacs' internal format ---
369 (character set) (range)
370 ASCII 0x00 .. 0x7F
371 ELSE (1st byte) 0x80 .. 0x9F
372 (rest bytes) 0xA0 .. 0xFF
373 ---------------------------------------------
374
375 */
376
377 enum emacs_code_class_type emacs_code_class[256];
378
379 /* Go to the next statement only if *SRC is accessible and the code is
380 greater than 0xA0. */
381 #define CHECK_CODE_RANGE_A0_FF \
382 do { \
383 if (src >= src_end) \
384 goto label_end_of_switch; \
385 else if (*src++ < 0xA0) \
386 return 0; \
387 } while (0)
388
389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
390 Check if a text is encoded in Emacs' internal format. If it is,
391 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
392
393 int
394 detect_coding_emacs_mule (src, src_end)
395 unsigned char *src, *src_end;
396 {
397 unsigned char c;
398 int composing = 0;
399
400 while (src < src_end)
401 {
402 c = *src++;
403
404 if (composing)
405 {
406 if (c < 0xA0)
407 composing = 0;
408 else
409 c -= 0x20;
410 }
411
412 switch (emacs_code_class[c])
413 {
414 case EMACS_ascii_code:
415 case EMACS_linefeed_code:
416 break;
417
418 case EMACS_control_code:
419 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
420 return 0;
421 break;
422
423 case EMACS_invalid_code:
424 return 0;
425
426 case EMACS_leading_code_composition: /* c == 0x80 */
427 if (composing)
428 CHECK_CODE_RANGE_A0_FF;
429 else
430 composing = 1;
431 break;
432
433 case EMACS_leading_code_4:
434 CHECK_CODE_RANGE_A0_FF;
435 /* fall down to check it two more times ... */
436
437 case EMACS_leading_code_3:
438 CHECK_CODE_RANGE_A0_FF;
439 /* fall down to check it one more time ... */
440
441 case EMACS_leading_code_2:
442 CHECK_CODE_RANGE_A0_FF;
443 break;
444
445 default:
446 label_end_of_switch:
447 break;
448 }
449 }
450 return CODING_CATEGORY_MASK_EMACS_MULE;
451 }
452
453 \f
454 /*** 3. ISO2022 handlers ***/
455
456 /* The following note describes the coding system ISO2022 briefly.
457 Since the intention of this note is to help in understanding of
458 the programs in this file, some parts are NOT ACCURATE or OVERLY
459 SIMPLIFIED. For the thorough understanding, please refer to the
460 original document of ISO2022.
461
462 ISO2022 provides many mechanisms to encode several character sets
463 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
464 all text is encoded by codes of less than 128. This may make the
465 encoded text a little bit longer, but the text gets more stability
466 to pass through several gateways (some of them strip off the MSB).
467
468 There are two kinds of character set: control character set and
469 graphic character set. The former contains control characters such
470 as `newline' and `escape' to provide control functions (control
471 functions are provided also by escape sequences). The latter
472 contains graphic characters such as 'A' and '-'. Emacs recognizes
473 two control character sets and many graphic character sets.
474
475 Graphic character sets are classified into one of the following
476 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
477 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
478 bytes (DIMENSION) and the number of characters in one dimension
479 (CHARS) of the set. In addition, each character set is assigned an
480 identification tag (called "final character" and denoted as <F>
481 here after) which is unique in each class. <F> of each character
482 set is decided by ECMA(*) when it is registered in ISO. Code range
483 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
484
485 Note (*): ECMA = European Computer Manufacturers Association
486
487 Here are examples of graphic character set [NAME(<F>)]:
488 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
489 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
490 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
491 o DIMENSION2_CHARS96 -- none for the moment
492
493 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
494 C0 [0x00..0x1F] -- control character plane 0
495 GL [0x20..0x7F] -- graphic character plane 0
496 C1 [0x80..0x9F] -- control character plane 1
497 GR [0xA0..0xFF] -- graphic character plane 1
498
499 A control character set is directly designated and invoked to C0 or
500 C1 by an escape sequence. The most common case is that ISO646's
501 control character set is designated/invoked to C0 and ISO6429's
502 control character set is designated/invoked to C1, and usually
503 these designations/invocations are omitted in a coded text. With
504 7-bit environment, only C0 can be used, and a control character for
505 C1 is encoded by an appropriate escape sequence to fit in the
506 environment. Every control character for C1 has a corresponding
507 escape sequence defined for it.
508
509 A graphic character set is at first designated to one of four
510 graphic registers (G0 through G3), then these graphic registers are
511 invoked to GL or GR. These designations and invocations can be
512 done independently. The most common case is that G0 is invoked to
513 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
514 these invocations and designations are omitted in a coded text.
515 With 7-bit environment, only GL can be used.
516
517 When a graphic character set of CHARS94 is invoked to GL, code 0x20
518 and 0x7F of GL area work as control characters SPACE and DEL
519 respectively, and code 0xA0 and 0xFF of GR area should not be used.
520
521 There are two ways of invocation: locking-shift and single-shift.
522 With locking-shift, the invocation lasts until the next different
523 invocation, whereas with single-shift, the invocation works only
524 for the following character and doesn't affect locking-shift.
525 Invocations are done by the following control characters or escape
526 sequences.
527
528 ----------------------------------------------------------------------
529 function control char escape sequence description
530 ----------------------------------------------------------------------
531 SI (shift-in) 0x0F none invoke G0 to GL
532 SO (shift-out) 0x0E none invoke G1 to GL
533 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
534 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
535 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
536 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
537 ----------------------------------------------------------------------
538 The first four are for locking-shift. Control characters for these
539 functions are defined by macros ISO_CODE_XXX in `coding.h'.
540
541 Designations are done by the following escape sequences.
542 ----------------------------------------------------------------------
543 escape sequence description
544 ----------------------------------------------------------------------
545 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
546 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
547 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
548 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
549 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
550 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
551 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
552 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
553 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
554 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
555 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
556 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
557 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
558 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
559 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
560 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
561 ----------------------------------------------------------------------
562
563 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
564 of dimension 1, chars 94, and final character <F>, and etc.
565
566 Note (*): Although these designations are not allowed in ISO2022,
567 Emacs accepts them on decoding, and produces them on encoding
568 CHARS96 character set in a coding system which is characterized as
569 7-bit environment, non-locking-shift, and non-single-shift.
570
571 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
572 '(' can be omitted. We call this as "short-form" here after.
573
574 Now you may notice that there are a lot of ways for encoding the
575 same multilingual text in ISO2022. Actually, there exist many
576 coding systems such as Compound Text (used in X's inter-client
577 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
578 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
579 localized platforms), and all of these are variants of ISO2022.
580
581 In addition to the above, Emacs handles two more kinds of escape
582 sequences: ISO6429's direction specification and Emacs' private
583 sequence for specifying character composition.
584
585 ISO6429's direction specification takes the following format:
586 o CSI ']' -- end of the current direction
587 o CSI '0' ']' -- end of the current direction
588 o CSI '1' ']' -- start of left-to-right text
589 o CSI '2' ']' -- start of right-to-left text
590 The control character CSI (0x9B: control sequence introducer) is
591 abbreviated to the escape sequence ESC '[' in 7-bit environment.
592
593 Character composition specification takes the following format:
594 o ESC '0' -- start character composition
595 o ESC '1' -- end character composition
596 Since these are not standard escape sequences of any ISO, the use
597 of them for these meaning is restricted to Emacs only. */
598
599 enum iso_code_class_type iso_code_class[256];
600
601 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
602 Check if a text is encoded in ISO2022. If it is, returns an
603 integer in which appropriate flag bits any of:
604 CODING_CATEGORY_MASK_ISO_7
605 CODING_CATEGORY_MASK_ISO_8_1
606 CODING_CATEGORY_MASK_ISO_8_2
607 CODING_CATEGORY_MASK_ISO_7_ELSE
608 CODING_CATEGORY_MASK_ISO_8_ELSE
609 are set. If a code which should never appear in ISO2022 is found,
610 returns 0. */
611
612 int
613 detect_coding_iso2022 (src, src_end)
614 unsigned char *src, *src_end;
615 {
616 int mask = (CODING_CATEGORY_MASK_ISO_7
617 | CODING_CATEGORY_MASK_ISO_8_1
618 | CODING_CATEGORY_MASK_ISO_8_2
619 | CODING_CATEGORY_MASK_ISO_7_ELSE
620 | CODING_CATEGORY_MASK_ISO_8_ELSE
621 );
622 int g1 = 0; /* 1 iff designating to G1. */
623 int c, i;
624
625 while (src < src_end)
626 {
627 c = *src++;
628 switch (c)
629 {
630 case ISO_CODE_ESC:
631 if (src >= src_end)
632 break;
633 c = *src++;
634 if ((c >= '(' && c <= '/'))
635 {
636 /* Designation sequence for a charset of dimension 1. */
637 if (src >= src_end)
638 break;
639 c = *src++;
640 if (c < ' ' || c >= 0x80)
641 /* Invalid designation sequence. */
642 return 0;
643 }
644 else if (c == '$')
645 {
646 /* Designation sequence for a charset of dimension 2. */
647 if (src >= src_end)
648 break;
649 c = *src++;
650 if (c >= '@' && c <= 'B')
651 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
652 ;
653 else if (c >= '(' && c <= '/')
654 {
655 if (src >= src_end)
656 break;
657 c = *src++;
658 if (c < ' ' || c >= 0x80)
659 /* Invalid designation sequence. */
660 return 0;
661 }
662 else
663 /* Invalid designation sequence. */
664 return 0;
665 }
666 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
667 /* Locking shift. */
668 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
669 | CODING_CATEGORY_MASK_ISO_8_ELSE);
670 else if (c == '0' || c == '1' || c == '2')
671 /* Start/end composition. */
672 ;
673 else
674 /* Invalid escape sequence. */
675 return 0;
676 break;
677
678 case ISO_CODE_SO:
679 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
680 | CODING_CATEGORY_MASK_ISO_8_ELSE);
681 break;
682
683 case ISO_CODE_CSI:
684 case ISO_CODE_SS2:
685 case ISO_CODE_SS3:
686 return CODING_CATEGORY_MASK_ISO_8_ELSE;
687
688 default:
689 if (c < 0x80)
690 break;
691 else if (c < 0xA0)
692 {
693 if (VECTORP (Vmicrosoft_code_table)
694 && !NILP (XVECTOR (Vmicrosoft_code_table)->contents[c]))
695 {
696 mask &= ~(CODING_CATEGORY_MASK_ISO_7
697 | CODING_CATEGORY_MASK_ISO_7_ELSE);
698 break;
699 }
700 return 0;
701 }
702 else
703 {
704 unsigned char *src_begin = src;
705
706 mask &= ~(CODING_CATEGORY_MASK_ISO_7
707 | CODING_CATEGORY_MASK_ISO_7_ELSE);
708 while (src < src_end && *src >= 0xA0)
709 src++;
710 if ((src - src_begin - 1) & 1 && src < src_end)
711 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
712 }
713 break;
714 }
715 }
716
717 return mask;
718 }
719
/* Decode one character whose charset is CHARSET and whose first
   position code is C1, storing its multi-byte form at `dst'.  If the
   dimension of CHARSET is 2, the second position code is fetched from
   `src' into C2 by ONE_MORE_BYTE.  If a unification table is in
   effect and unify_char returns a non-negative character, that
   character is stored instead.  A negative CHARSET means that we are
   decoding ill-formed text; all we can do then is to store C1 as is.

   Besides `dst', the expansion uses the caller's variables `coding',
   `src', `src_end', `c2', and `unification_table'.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
                                                                        \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        /* 0xFF tells that composition rules are embedded.  */          \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
                                                                        \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table))                                  \
          {                                                             \
            c_alt = unify_char (unification_table,                      \
                                -1, (charset), c1, c2);                 \
            if (c_alt >= 0)                                             \
              SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
          }                                                             \
      }                                                                 \
                                                                        \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
                                                                        \
    /* Record that a composition rule follows.  */                      \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
756
/* Record in CODING that the charset identified by DIMENSION, CHARS,
   and FINAL_CHAR is designated to graphic register REG.  Nothing is
   recorded when ISO_CHARSET_TABLE yields a negative value for them.
   If `coding->direction' is 1 and the charset has a non-negative
   reverse charset, the reverse charset is recorded instead.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
                                     make_number (chars),               \
                                     make_number (final_char));         \
    if (charset >= 0)                                                   \
      {                                                                 \
        if (coding->direction == 1)                                     \
          {                                                             \
            if (CHARSET_REVERSE_CHARSET (charset) >= 0)                 \
              charset = CHARSET_REVERSE_CHARSET (charset);              \
          }                                                             \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
      }                                                                 \
  } while (0)
771
772 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
773
774 int
775 decode_coding_iso2022 (coding, source, destination,
776 src_bytes, dst_bytes, consumed)
777 struct coding_system *coding;
778 unsigned char *source, *destination;
779 int src_bytes, dst_bytes;
780 int *consumed;
781 {
782 unsigned char *src = source;
783 unsigned char *src_end = source + src_bytes;
784 unsigned char *dst = destination;
785 unsigned char *dst_end = destination + dst_bytes;
786 /* Since the maximum bytes produced by each loop is 7, we subtract 6
787 from DST_END to assure that overflow checking is necessary only
788 at the head of loop. */
789 unsigned char *adjusted_dst_end = dst_end - 6;
790 int charset;
791 /* Charsets invoked to graphic plane 0 and 1 respectively. */
792 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
793 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
794 Lisp_Object unification_table
795 = coding->character_unification_table_for_decode;
796
797 if (!NILP (Venable_character_unification) && NILP (unification_table))
798 unification_table = Vstandard_character_unification_table_for_decode;
799
800 while (src < src_end && dst < adjusted_dst_end)
801 {
802 /* SRC_BASE remembers the start position in source in each loop.
803 The loop will be exited when there's not enough source text
804 to analyze long escape sequence or 2-byte code (within macros
805 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
806 to SRC_BASE before exiting. */
807 unsigned char *src_base = src;
808 int c1 = *src++, c2;
809
810 switch (iso_code_class [c1])
811 {
812 case ISO_0x20_or_0x7F:
813 if (!coding->composing
814 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
815 {
816 /* This is SPACE or DEL. */
817 *dst++ = c1;
818 break;
819 }
820 /* This is a graphic character, we fall down ... */
821
822 case ISO_graphic_plane_0:
823 if (coding->composing == COMPOSING_WITH_RULE_RULE)
824 {
825 /* This is a composition rule. */
826 *dst++ = c1 | 0x80;
827 coding->composing = COMPOSING_WITH_RULE_TAIL;
828 }
829 else
830 DECODE_ISO_CHARACTER (charset0, c1);
831 break;
832
833 case ISO_0xA0_or_0xFF:
834 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
835 {
836 /* Invalid code. */
837 *dst++ = c1;
838 break;
839 }
840 /* This is a graphic character, we fall down ... */
841
842 case ISO_graphic_plane_1:
843 DECODE_ISO_CHARACTER (charset1, c1);
844 break;
845
846 case ISO_control_code:
847 /* All ISO2022 control characters in this class have the
848 same representation in Emacs internal format. */
849 *dst++ = c1;
850 break;
851
852 case ISO_carriage_return:
853 if (coding->eol_type == CODING_EOL_CR)
854 {
855 *dst++ = '\n';
856 }
857 else if (coding->eol_type == CODING_EOL_CRLF)
858 {
859 ONE_MORE_BYTE (c1);
860 if (c1 == ISO_CODE_LF)
861 *dst++ = '\n';
862 else
863 {
864 src--;
865 *dst++ = c1;
866 }
867 }
868 else
869 {
870 *dst++ = c1;
871 }
872 break;
873
874 case ISO_shift_out:
875 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
876 goto label_invalid_escape_sequence;
877 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
878 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
879 break;
880
881 case ISO_shift_in:
882 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
883 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
884 break;
885
886 case ISO_single_shift_2_7:
887 case ISO_single_shift_2:
888 /* SS2 is handled as an escape sequence of ESC 'N' */
889 c1 = 'N';
890 goto label_escape_sequence;
891
892 case ISO_single_shift_3:
893 /* SS3 is handled as an escape sequence of ESC 'O' */
894 c1 = 'O';
895 goto label_escape_sequence;
896
897 case ISO_control_sequence_introducer:
898 /* CSI is handled as an escape sequence of ESC '[' ... */
899 c1 = '[';
900 goto label_escape_sequence;
901
902 case ISO_escape:
903 ONE_MORE_BYTE (c1);
904 label_escape_sequence:
905 /* Escape sequences handled by Emacs are invocation,
906 designation, direction specification, and character
907 composition specification. */
908 switch (c1)
909 {
910 case '&': /* revision of following character set */
911 ONE_MORE_BYTE (c1);
912 if (!(c1 >= '@' && c1 <= '~'))
913 goto label_invalid_escape_sequence;
914 ONE_MORE_BYTE (c1);
915 if (c1 != ISO_CODE_ESC)
916 goto label_invalid_escape_sequence;
917 ONE_MORE_BYTE (c1);
918 goto label_escape_sequence;
919
920 case '$': /* designation of 2-byte character set */
921 ONE_MORE_BYTE (c1);
922 if (c1 >= '@' && c1 <= 'B')
923 { /* designation of JISX0208.1978, GB2312.1980,
924 or JISX0208.1980 */
925 DECODE_DESIGNATION (0, 2, 94, c1);
926 }
927 else if (c1 >= 0x28 && c1 <= 0x2B)
928 { /* designation of DIMENSION2_CHARS94 character set */
929 ONE_MORE_BYTE (c2);
930 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
931 }
932 else if (c1 >= 0x2C && c1 <= 0x2F)
933 { /* designation of DIMENSION2_CHARS96 character set */
934 ONE_MORE_BYTE (c2);
935 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
936 }
937 else
938 goto label_invalid_escape_sequence;
939 break;
940
941 case 'n': /* invocation of locking-shift-2 */
942 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
943 goto label_invalid_escape_sequence;
944 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
945 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
946 break;
947
948 case 'o': /* invocation of locking-shift-3 */
949 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
950 goto label_invalid_escape_sequence;
951 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
952 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
953 break;
954
955 case 'N': /* invocation of single-shift-2 */
956 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
957 goto label_invalid_escape_sequence;
958 ONE_MORE_BYTE (c1);
959 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
960 DECODE_ISO_CHARACTER (charset, c1);
961 break;
962
963 case 'O': /* invocation of single-shift-3 */
964 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
965 goto label_invalid_escape_sequence;
966 ONE_MORE_BYTE (c1);
967 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
968 DECODE_ISO_CHARACTER (charset, c1);
969 break;
970
971 case '0': /* start composing without embeded rules */
972 coding->composing = COMPOSING_NO_RULE_HEAD;
973 break;
974
975 case '1': /* end composing */
976 coding->composing = COMPOSING_NO;
977 break;
978
979 case '2': /* start composing with embeded rules */
980 coding->composing = COMPOSING_WITH_RULE_HEAD;
981 break;
982
983 case '[': /* specification of direction */
984 /* For the moment, nested direction is not supported.
985 So, the value of `coding->direction' is 0 or 1: 0
986 means left-to-right, 1 means right-to-left. */
987 ONE_MORE_BYTE (c1);
988 switch (c1)
989 {
990 case ']': /* end of the current direction */
991 coding->direction = 0;
992
993 case '0': /* end of the current direction */
994 case '1': /* start of left-to-right direction */
995 ONE_MORE_BYTE (c1);
996 if (c1 == ']')
997 coding->direction = 0;
998 else
999 goto label_invalid_escape_sequence;
1000 break;
1001
1002 case '2': /* start of right-to-left direction */
1003 ONE_MORE_BYTE (c1);
1004 if (c1 == ']')
1005 coding->direction= 1;
1006 else
1007 goto label_invalid_escape_sequence;
1008 break;
1009
1010 default:
1011 goto label_invalid_escape_sequence;
1012 }
1013 break;
1014
1015 default:
1016 if (c1 >= 0x28 && c1 <= 0x2B)
1017 { /* designation of DIMENSION1_CHARS94 character set */
1018 ONE_MORE_BYTE (c2);
1019 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1020 }
1021 else if (c1 >= 0x2C && c1 <= 0x2F)
1022 { /* designation of DIMENSION1_CHARS96 character set */
1023 ONE_MORE_BYTE (c2);
1024 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1025 }
1026 else
1027 {
1028 goto label_invalid_escape_sequence;
1029 }
1030 }
1031 /* We must update these variables now. */
1032 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1033 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1034 break;
1035
1036 label_invalid_escape_sequence:
1037 {
1038 int length = src - src_base;
1039
1040 bcopy (src_base, dst, length);
1041 dst += length;
1042 }
1043 }
1044 continue;
1045
1046 label_end_of_loop:
1047 coding->carryover_size = src - src_base;
1048 bcopy (src_base, coding->carryover, coding->carryover_size);
1049 src = src_base;
1050 break;
1051 }
1052
1053 /* If this is the last block of the text to be decoded, we had
1054 better just flush out all remaining codes in the text although
1055 they are not valid characters. */
1056 if (coding->last_block)
1057 {
1058 bcopy (src, dst, src_end - src);
1059 dst += (src_end - src);
1060 src = src_end;
1061 }
1062 *consumed = src - source;
1063 return dst - destination;
1064 }
1065
1066 /* ISO2022 encoding stuff. */
1067
1068 /*
1069 It is not enough to say just "ISO2022" on encoding, we have to
1070 specify more details. In Emacs, each coding-system of ISO2022
1071 variant has the following specifications:
1072 1. Initial designation to G0 thru G3.
1073 2. Allows short-form designation?
1074 3. ASCII should be designated to G0 before control characters?
1075 4. ASCII should be designated to G0 at end of line?
1076 5. 7-bit environment or 8-bit environment?
1077 6. Use locking-shift?
1078 7. Use Single-shift?
1079 And the following two are only for Japanese:
1080 8. Use ASCII in place of JIS0201-1976-Roman?
1081 9. Use JISX0208-1983 in place of JISX0208-1978?
1082 These specifications are encoded in `coding->flags' as flag bits
1083 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1084 details.
1085 */
1086
/* Emit the escape sequence that designates CHARSET to graphic register
   REG, and record the new designation in CODING.  When CHARSET has an
   entry in Vcharset_revision_alist, the sequence is preceded by
   `ESC & <revision>'.  The intermediate byte naming REG is omitted
   (short form) only for a 2-dimensional 94-char set that goes to
   register 0, whose <final-char> is '@', 'A', or 'B', and only when
   CODING has CODING_FLAG_ISO_SHORT_FORM set.  Output goes through the
   caller's DST pointer.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate bytes for G0..G3, by charset size.  */              \
    const char *g94 = "()*+";                                           \
    const char *g96 = ",-./";                                           \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    Lisp_Object revision                                                \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int is_94 = (CHARSET_CHARS (charset) == 94);                        \
    int short_form = 0;                                                 \
                                                                        \
    if (! NILP (revision))                                              \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision)->cdr) + '@';                    \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        *dst++ = '$';                                                   \
        short_form = (is_94                                             \
                      && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)   \
                      && reg == 0                                       \
                      && final_char >= '@' && final_char <= 'B');       \
      }                                                                 \
    if (! short_form)                                                   \
      *dst++ = (unsigned char) (is_94 ? g94[reg] : g96[reg]);           \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1129
1130 /* The following two macros produce codes (control character or escape
1131 sequence) for ISO2022 single-shift functions (single-shift-2 and
1132 single-shift-3). */
1133
#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    /* 8-bit form is the SS2 control code, 7-bit form is        \
       `ESC N'.  */                                             \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    /* Tell the character encoders a single-shift is pending.  */ \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1142
#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    /* 8-bit form is the SS3 control code, 7-bit form is        \
       `ESC O'.  */                                             \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    /* Tell the character encoders a single-shift is pending.  */ \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1151
1152 /* The following four macros produce codes (control character or
1153 escape sequence) for ISO2022 locking-shift functions (shift-in,
1154 shift-out, locking-shift-2, and locking-shift-3). */
1155
#define ENCODE_SHIFT_IN                                         \
  do {                                                          \
    /* SI: graphic register 0 becomes graphic plane 0.  */      \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;                 \
    *dst++ = ISO_CODE_SI;                                       \
  } while (0)
1161
#define ENCODE_SHIFT_OUT                                        \
  do {                                                          \
    /* SO: graphic register 1 becomes graphic plane 0.  */      \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;                 \
    *dst++ = ISO_CODE_SO;                                       \
  } while (0)
1167
#define ENCODE_LOCKING_SHIFT_2                                  \
  do {                                                          \
    /* `ESC n': graphic register 2 becomes graphic plane 0.  */ \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = 'n';                                               \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;                 \
  } while (0)
1173
#define ENCODE_LOCKING_SHIFT_3                                  \
  do {                                                          \
    /* `ESC o': graphic register 3 becomes graphic plane 0.  */ \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = 'o';                                               \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;                 \
  } while (0)
1179
/* Emit a one-byte character of CHARSET whose position-code is C1.
   The body is a retry loop: when CHARSET is not reachable through
   graphic plane 0 or 1 (and no single-shift is pending), the needed
   designation/invocation sequences are emitted by
   encode_invocation_designation and the checks are run again.  With
   CODING_FLAG_ISO_SAFE set, a charset that requests no designation
   is replaced by CODING_INHIBIT_CHARACTER_SUBSTITUTION, emitted once
   or twice depending on CHARSET_WIDTH.  Uses the caller's CODING and
   DST.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The byte right after a single-shift: 7-bit or 8-bit form     \
           according to the coding system.  */                          \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* Invoked to graphic plane 0: high bit clear.  */              \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* Invoked to graphic plane 1: high bit set.  */                \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)     \
            == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))               \
      {                                                                 \
        /* Do not encode this character; substitute it.  */             \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Designate      \
       and/or invoke it, then go round again to emit the byte.  */      \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1224
/* Emit a two-byte character of CHARSET whose position-codes are C1
   and C2.  Works like ENCODE_ISO_CHARACTER_DIMENSION1: the body is a
   retry loop that emits designation/invocation sequences through
   encode_invocation_designation until CHARSET is reachable, and with
   CODING_FLAG_ISO_SAFE a charset that requests no designation is
   replaced by CODING_INHIBIT_CHARACTER_SUBSTITUTION.  Uses the
   caller's CODING and DST.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The two bytes right after a single-shift.  */                \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* Invoked to graphic plane 0: high bits clear.  */             \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* Invoked to graphic plane 1: high bits set.  */               \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)     \
            == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))               \
      {                                                                 \
        /* Do not encode this character; substitute it.  */             \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Designate      \
       and/or invoke it, then go round again to emit the bytes.  */     \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1268
/* Encode one character of CHARSET with position-codes C1 (and C2 for
   a two-byte charset).  If the caller's `unification_table' is
   non-nil and unify_char yields a replacement (a value >= 0), the
   replacement is split into its own charset and position-codes,
   overwriting C1 and C2, and that is what gets encoded.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int charset_alt = charset;                                          \
    int c_alt = -1;                                                     \
                                                                        \
    if (! NILP (unification_table))                                     \
      c_alt = unify_char (unification_table, -1, charset, c1, c2);      \
    if (c_alt >= 0)                                                     \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
                                                                        \
    if (CHARSET_DIMENSION (charset_alt) != 1)                           \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);            \
    else                                                                \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                \
  } while (0)
1283
1284 /* Produce designation and invocation codes at a place pointed by DST
1285 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1286 Return new DST. */
1287
1288 unsigned char *
1289 encode_invocation_designation (charset, coding, dst)
1290 int charset;
1291 struct coding_system *coding;
1292 unsigned char *dst;
1293 {
1294 int reg; /* graphic register number */
1295
1296 /* At first, check designations. */
1297 for (reg = 0; reg < 4; reg++)
1298 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1299 break;
1300
1301 if (reg >= 4)
1302 {
1303 /* CHARSET is not yet designated to any graphic registers. */
1304 /* At first check the requested designation. */
1305 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1306 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1307 /* Since CHARSET requests no special designation, designate it
1308 to graphic register 0. */
1309 reg = 0;
1310
1311 ENCODE_DESIGNATION (charset, reg, coding);
1312 }
1313
1314 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1315 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1316 {
1317 /* Since the graphic register REG is not invoked to any graphic
1318 planes, invoke it to graphic plane 0. */
1319 switch (reg)
1320 {
1321 case 0: /* graphic register 0 */
1322 ENCODE_SHIFT_IN;
1323 break;
1324
1325 case 1: /* graphic register 1 */
1326 ENCODE_SHIFT_OUT;
1327 break;
1328
1329 case 2: /* graphic register 2 */
1330 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1331 ENCODE_SINGLE_SHIFT_2;
1332 else
1333 ENCODE_LOCKING_SHIFT_2;
1334 break;
1335
1336 case 3: /* graphic register 3 */
1337 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1338 ENCODE_SINGLE_SHIFT_3;
1339 else
1340 ENCODE_LOCKING_SHIFT_3;
1341 break;
1342 }
1343 }
1344 return dst;
1345 }
1346
/* The following three macros produce the escape sequences that mark
   character composition: `ESC 0' starts a composition without rules,
   `ESC 2' starts one with embedded rules, and `ESC 1' ends either
   kind.  Each writes two bytes through the caller's DST.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1351
/* The following three macros produce codes for indicating direction
   of text.  All of them write through the caller's DST and read the
   caller's CODING.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits CSI: `ESC [' when CODING
   has CODING_FLAG_ISO_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  `coding->flags' is a set of flag bits, so the flag is
   tested with `&', not compared with `=='.

   ENCODE_DIRECTION_R2L emits `CSI 2 ]' and ENCODE_DIRECTION_L2R
   emits `CSI 0 ]'.  They are complete statements (do ... while (0))
   because the CSI macro is itself a statement and cannot be an
   operand of the comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                      \
  do {                                                          \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)             \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = '[';                                           \
      }                                                         \
    else                                                        \
      *dst++ = ISO_CODE_CSI;                                    \
  } while (0)

#define ENCODE_DIRECTION_R2L                                    \
  do {                                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                         \
    *dst++ = '2';                                               \
    *dst++ = ']';                                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                                    \
  do {                                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                         \
    *dst++ = '0';                                               \
    *dst++ = ']';                                               \
  } while (0)
1367
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: shift in if graphic plane 0 does not hold
   register 0, then re-designate every register that has an initial
   designation (a value >= 0) differing from its current one.  Uses
   the caller's CODING and DST.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        int initial                                                     \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg);    \
                                                                        \
        if (initial < 0                                                 \
            || initial == CODING_SPEC_ISO_DESIGNATION (coding,          \
                                                       reset_reg))      \
          continue;                                                     \
        ENCODE_DESIGNATION (initial, reset_reg, coding);                \
      }                                                                 \
  } while (0)
1382
1383 /* Produce designation sequences of charsets in the line started from
1384 *SRC to a place pointed by DSTP.
1385
1386 If the current block ends before any end-of-line, we may fail to
1387 find all the necessary *designations. */
1388 encode_designation_at_bol (coding, table, src, src_end, dstp)
1389 struct coding_system *coding;
1390 Lisp_Object table;
1391 unsigned char *src, *src_end, **dstp;
1392 {
1393 int charset, c, found = 0, reg;
1394 /* Table of charsets to be designated to each graphic register. */
1395 int r[4];
1396 unsigned char *dst = *dstp;
1397
1398 for (reg = 0; reg < 4; reg++)
1399 r[reg] = -1;
1400
1401 while (src < src_end && *src != '\n' && found < 4)
1402 {
1403 int bytes = BYTES_BY_CHAR_HEAD (*src);
1404
1405 if (NILP (table))
1406 charset = CHARSET_AT (src);
1407 else
1408 {
1409 int c_alt, c1, c2;
1410
1411 SPLIT_STRING(src, bytes, charset, c1, c2);
1412 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1413 charset = CHAR_CHARSET (c_alt);
1414 }
1415
1416 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1417 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1418 {
1419 found++;
1420 r[reg] = charset;
1421 }
1422
1423 src += bytes;
1424 }
1425
1426 if (found)
1427 {
1428 for (reg = 0; reg < 4; reg++)
1429 if (r[reg] >= 0
1430 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1431 ENCODE_DESIGNATION (r[reg], reg, coding);
1432 *dstp = dst;
1433 }
1434 }
1435
1436 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1437
1438 int
1439 encode_coding_iso2022 (coding, source, destination,
1440 src_bytes, dst_bytes, consumed)
1441 struct coding_system *coding;
1442 unsigned char *source, *destination;
1443 int src_bytes, dst_bytes;
1444 int *consumed;
1445 {
1446 unsigned char *src = source;
1447 unsigned char *src_end = source + src_bytes;
1448 unsigned char *dst = destination;
1449 unsigned char *dst_end = destination + dst_bytes;
1450 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1451 from DST_END to assure overflow checking is necessary only at the
1452 head of loop. */
1453 unsigned char *adjusted_dst_end = dst_end - 19;
1454 Lisp_Object unification_table
1455 = coding->character_unification_table_for_encode;
1456
1457 if (!NILP (Venable_character_unification) && NILP (unification_table))
1458 unification_table = Vstandard_character_unification_table_for_encode;
1459
1460 while (src < src_end && dst < adjusted_dst_end)
1461 {
1462 /* SRC_BASE remembers the start position in source in each loop.
1463 The loop will be exited when there's not enough source text
1464 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1465 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1466 reset to SRC_BASE before exiting. */
1467 unsigned char *src_base = src;
1468 int charset, c1, c2, c3, c4;
1469
1470 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1471 && CODING_SPEC_ISO_BOL (coding))
1472 {
1473 /* We have to produce designation sequences if any now. */
1474 encode_designation_at_bol (coding, unification_table,
1475 src, src_end, &dst);
1476 CODING_SPEC_ISO_BOL (coding) = 0;
1477 }
1478
1479 c1 = *src++;
1480 /* If we are seeing a component of a composite character, we are
1481 seeing a leading-code specially encoded for composition, or a
1482 composition rule if composing with rule. We must set C1
1483 to a normal leading-code or an ASCII code. If we are not at
1484 a composed character, we must reset the composition state. */
1485 if (COMPOSING_P (coding->composing))
1486 {
1487 if (c1 < 0xA0)
1488 {
1489 /* We are not in a composite character any longer. */
1490 coding->composing = COMPOSING_NO;
1491 ENCODE_COMPOSITION_END;
1492 }
1493 else
1494 {
1495 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1496 {
1497 *dst++ = c1 & 0x7F;
1498 coding->composing = COMPOSING_WITH_RULE_HEAD;
1499 continue;
1500 }
1501 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1502 coding->composing = COMPOSING_WITH_RULE_RULE;
1503 if (c1 == 0xA0)
1504 {
1505 /* This is an ASCII component. */
1506 ONE_MORE_BYTE (c1);
1507 c1 &= 0x7F;
1508 }
1509 else
1510 /* This is a leading-code of non ASCII component. */
1511 c1 -= 0x20;
1512 }
1513 }
1514
1515 /* Now encode one character. C1 is a control character, an
1516 ASCII character, or a leading-code of multi-byte character. */
1517 switch (emacs_code_class[c1])
1518 {
1519 case EMACS_ascii_code:
1520 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1521 break;
1522
1523 case EMACS_control_code:
1524 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1525 ENCODE_RESET_PLANE_AND_REGISTER;
1526 *dst++ = c1;
1527 break;
1528
1529 case EMACS_carriage_return_code:
1530 if (!coding->selective)
1531 {
1532 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1533 ENCODE_RESET_PLANE_AND_REGISTER;
1534 *dst++ = c1;
1535 break;
1536 }
1537 /* fall down to treat '\r' as '\n' ... */
1538
1539 case EMACS_linefeed_code:
1540 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1541 ENCODE_RESET_PLANE_AND_REGISTER;
1542 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1543 bcopy (coding->spec.iso2022.initial_designation,
1544 coding->spec.iso2022.current_designation,
1545 sizeof coding->spec.iso2022.initial_designation);
1546 if (coding->eol_type == CODING_EOL_LF
1547 || coding->eol_type == CODING_EOL_UNDECIDED)
1548 *dst++ = ISO_CODE_LF;
1549 else if (coding->eol_type == CODING_EOL_CRLF)
1550 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1551 else
1552 *dst++ = ISO_CODE_CR;
1553 CODING_SPEC_ISO_BOL (coding) = 1;
1554 break;
1555
1556 case EMACS_leading_code_2:
1557 ONE_MORE_BYTE (c2);
1558 if (c2 < 0xA0)
1559 {
1560 /* invalid sequence */
1561 *dst++ = c1;
1562 *dst++ = c2;
1563 }
1564 else
1565 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1566 break;
1567
1568 case EMACS_leading_code_3:
1569 TWO_MORE_BYTES (c2, c3);
1570 if (c2 < 0xA0 || c3 < 0xA0)
1571 {
1572 /* invalid sequence */
1573 *dst++ = c1;
1574 *dst++ = c2;
1575 *dst++ = c3;
1576 }
1577 else if (c1 < LEADING_CODE_PRIVATE_11)
1578 ENCODE_ISO_CHARACTER (c1, c2, c3);
1579 else
1580 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1581 break;
1582
1583 case EMACS_leading_code_4:
1584 THREE_MORE_BYTES (c2, c3, c4);
1585 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1586 {
1587 /* invalid sequence */
1588 *dst++ = c1;
1589 *dst++ = c2;
1590 *dst++ = c3;
1591 *dst++ = c4;
1592 }
1593 else
1594 ENCODE_ISO_CHARACTER (c2, c3, c4);
1595 break;
1596
1597 case EMACS_leading_code_composition:
1598 ONE_MORE_BYTE (c2);
1599 if (c2 < 0xA0)
1600 {
1601 /* invalid sequence */
1602 *dst++ = c1;
1603 *dst++ = c2;
1604 }
1605 else if (c2 == 0xFF)
1606 {
1607 coding->composing = COMPOSING_WITH_RULE_HEAD;
1608 ENCODE_COMPOSITION_WITH_RULE_START;
1609 }
1610 else
1611 {
1612 /* Rewind one byte because it is a character code of
1613 composition elements. */
1614 src--;
1615 coding->composing = COMPOSING_NO_RULE_HEAD;
1616 ENCODE_COMPOSITION_NO_RULE_START;
1617 }
1618 break;
1619
1620 case EMACS_invalid_code:
1621 *dst++ = c1;
1622 break;
1623 }
1624 continue;
1625 label_end_of_loop:
1626 /* We reach here because the source date ends not at character
1627 boundary. */
1628 coding->carryover_size = src_end - src_base;
1629 bcopy (src_base, coding->carryover, coding->carryover_size);
1630 src = src_end;
1631 break;
1632 }
1633
1634 /* If this is the last block of the text to be encoded, we must
1635 reset graphic planes and registers to the initial state. */
1636 if (src >= src_end && coding->last_block)
1637 {
1638 ENCODE_RESET_PLANE_AND_REGISTER;
1639 if (coding->carryover_size > 0
1640 && coding->carryover_size < (dst_end - dst))
1641 {
1642 bcopy (coding->carryover, dst, coding->carryover_size);
1643 dst += coding->carryover_size;
1644 coding->carryover_size = 0;
1645 }
1646 }
1647 *consumed = src - source;
1648 return dst - destination;
1649 }
1650
1651 \f
1652 /*** 4. SJIS and BIG5 handlers ***/
1653
1654 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1655 quite widely. So, for the moment, Emacs supports them in the bare
1656 C code. But, in the future, they may be supported only by CCL. */
1657
1658 /* SJIS is a coding system encoding three character sets: ASCII, right
1659 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1660 as is. A character of charset katakana-jisx0201 is encoded by
1661 "position-code + 0x80". A character of charset japanese-jisx0208
1662 is encoded in 2 bytes, but the two position-codes are divided and
1663 shifted so that they fit in the ranges below.
1664
1665 --- CODE RANGE of SJIS ---
1666 (character set) (range)
1667 ASCII 0x00 .. 0x7F
1668 KATAKANA-JISX0201 0xA0 .. 0xDF
1669 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1670 (2nd byte) 0x40 .. 0xFF
1671 -------------------------------
1672
1673 */
1674
1675 /* BIG5 is a coding system encoding two character sets: ASCII and
1676 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1677 character set and is encoded in two-byte.
1678
1679 --- CODE RANGE of BIG5 ---
1680 (character set) (range)
1681 ASCII 0x00 .. 0x7F
1682 Big5 (1st byte) 0xA1 .. 0xFE
1683 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1684 --------------------------
1685
1686 Since the number of characters in Big5 is larger than maximum
1687 characters in Emacs' charset (96x96), it can't be handled as one
1688 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1689 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1690 contains frequently used characters and the latter contains less
1691 frequently used characters. */
1692
1693 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1694 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1695 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1696 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1697
/* Number of Big5 characters sharing one 1st byte: the 2nd-byte codes
   0x40..0x7E plus the 2nd-byte codes 0xA1..0xFE.  */
#define BIG5_SAME_ROW ((0x7F - 0x40) + (0xFF - 0xA1))
1700
/* Convert the Big5 code (B1, B2) to an Emacs charset and position
   codes.  CHARSET receives charset_big5_1 when B1 < 0xC9 and
   charset_big5_2 otherwise; C1 and C2 receive the position-codes,
   each counted from 0x21 in rows of 94.  B1 and B2 are evaluated
   more than once, so they must be free of side effects; every
   argument is parenthesized so that expressions may be passed.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character within the whole Big5 table.  */   \
    unsigned int big5_idx                                               \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
                                                                        \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the start of the 2nd set.  */     \
        big5_idx -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                      \
      }                                                                 \
    (c1) = big5_idx / (0xFF - 0xA1) + 0x21;                             \
    (c2) = big5_idx % (0xFF - 0xA1) + 0x21;                             \
  } while (0)
1715
/* Convert a character of CHARSET (charset_big5_1 or charset_big5_2)
   with position-codes C1 and C2, each counted from 0x21, to the Big5
   code (B1, B2).  This is the inverse of DECODE_BIG5.  Every argument
   is parenthesized so that expressions such as `c & 0x7F' may be
   passed; B2 is evaluated more than once.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    /* Linear index of the character within its charset.  */            \
    unsigned int big5_idx                                               \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
                                                                        \
    if ((charset) == charset_big5_2)                                    \
      /* The 2nd set starts at 1st byte 0xC9.  */                       \
      big5_idx += BIG5_SAME_ROW * (0xC9 - 0xA1);                        \
    (b1) = big5_idx / BIG5_SAME_ROW + 0xA1;                             \
    (b2) = big5_idx % BIG5_SAME_ROW;                                    \
    /* The first 0x3F codes of a row go to 0x40..0x7E, the rest to      \
       0xA1..0xFE.  */                                                  \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1725
/* Store at the caller's DST the character of CHARSET with
   position-codes C1 and C2.  When the caller's `unification_table'
   is non-nil and unify_char yields a replacement (a value >= 0), the
   replacement's charset and position-codes are used instead,
   overwriting C1 and C2.  A resulting charset that is ASCII or
   negative is stored with DECODE_CHARACTER_ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
                                                                        \
    if (! NILP (unification_table))                                     \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2);  \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt != CHARSET_ASCII && charset_alt >= 0)               \
      {                                                                 \
        if (CHARSET_DIMENSION (charset_alt) == 1)                       \
          DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                \
        else                                                            \
          DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);            \
      }                                                                 \
    else                                                                \
      DECODE_CHARACTER_ASCII (c1);                                      \
  } while (0)
1740
/* Encode at the caller's DST the character of CHARSET with
   position-codes C1 and C2, as SJIS when the caller's `sjis_p' is
   non-zero and as BIG5 otherwise.  When the caller's
   `unification_table' is non-nil and unify_char yields a replacement
   (a value >= 0), the replacement is encoded instead, overwriting C1
   and C2.

   ASCII is written as is.  With SJIS_P, katakana-jisx0201 is written
   as the single byte C1 and jisx0208 goes through ENCODE_SJIS.
   Without SJIS_P, the two Big5 charsets go through ENCODE_BIG5.  A
   character of any other charset is written without any encoding:
   the charset value followed by its position-code(s).

   The expansion deliberately does not end with a semicolon, so that
   the macro followed by `;' is exactly one statement and is safe in
   an unbraced `if ... else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (unification_table)                                       \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = charset;                                            \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else                                                            \
          *dst++ = charset_alt, *dst++ = c1;                            \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Work on the 7-bit position-codes.  */                        \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && charset_alt == charset_jisx0208)                  \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;               \
      }                                                                 \
  } while (0)
1782
1783 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1784 Check if a text is encoded in SJIS. If it is, return
1785 CODING_CATEGORY_MASK_SJIS, else return 0. */
1786
1787 int
1788 detect_coding_sjis (src, src_end)
1789 unsigned char *src, *src_end;
1790 {
1791 unsigned char c;
1792
1793 while (src < src_end)
1794 {
1795 c = *src++;
1796 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1797 return 0;
1798 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1799 {
1800 if (src < src_end && *src++ < 0x40)
1801 return 0;
1802 }
1803 }
1804 return CODING_CATEGORY_MASK_SJIS;
1805 }
1806
1807 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1808 Check if a text is encoded in BIG5. If it is, return
1809 CODING_CATEGORY_MASK_BIG5, else return 0. */
1810
1811 int
1812 detect_coding_big5 (src, src_end)
1813 unsigned char *src, *src_end;
1814 {
1815 unsigned char c;
1816
1817 while (src < src_end)
1818 {
1819 c = *src++;
1820 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1821 return 0;
1822 if (c >= 0xA1)
1823 {
1824 if (src >= src_end)
1825 break;
1826 c = *src++;
1827 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1828 return 0;
1829 }
1830 }
1831 return CODING_CATEGORY_MASK_BIG5;
1832 }
1833
1834 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1835 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1836
1837 int
1838 decode_coding_sjis_big5 (coding, source, destination,
1839 src_bytes, dst_bytes, consumed, sjis_p)
1840 struct coding_system *coding;
1841 unsigned char *source, *destination;
1842 int src_bytes, dst_bytes;
1843 int *consumed;
1844 int sjis_p;
1845 {
1846 unsigned char *src = source;
1847 unsigned char *src_end = source + src_bytes;
1848 unsigned char *dst = destination;
1849 unsigned char *dst_end = destination + dst_bytes;
1850 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1851 from DST_END to assure overflow checking is necessary only at the
1852 head of loop. */
1853 unsigned char *adjusted_dst_end = dst_end - 3;
1854 Lisp_Object unification_table
1855 = coding->character_unification_table_for_decode;
1856
1857 if (!NILP (Venable_character_unification) && NILP (unification_table))
1858 unification_table = Vstandard_character_unification_table_for_decode;
1859
1860 while (src < src_end && dst < adjusted_dst_end)
1861 {
1862 /* SRC_BASE remembers the start position in source in each loop.
1863 The loop will be exited when there's not enough source text
1864 to analyze two-byte character (within macro ONE_MORE_BYTE).
1865 In that case, SRC is reset to SRC_BASE before exiting. */
1866 unsigned char *src_base = src;
1867 unsigned char c1 = *src++, c2, c3, c4;
1868
1869 if (c1 == '\r')
1870 {
1871 if (coding->eol_type == CODING_EOL_CRLF)
1872 {
1873 ONE_MORE_BYTE (c2);
1874 if (c2 == '\n')
1875 *dst++ = c2;
1876 else
1877 /* To process C2 again, SRC is subtracted by 1. */
1878 *dst++ = c1, src--;
1879 }
1880 else
1881 *dst++ = c1;
1882 }
1883 else if (c1 < 0x20)
1884 *dst++ = c1;
1885 else if (c1 < 0x80)
1886 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1887 else if (c1 < 0xA0 || c1 >= 0xE0)
1888 {
1889 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1890 if (sjis_p)
1891 {
1892 ONE_MORE_BYTE (c2);
1893 DECODE_SJIS (c1, c2, c3, c4);
1894 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1895 }
1896 else if (c1 >= 0xE0 && c1 < 0xFF)
1897 {
1898 int charset;
1899
1900 ONE_MORE_BYTE (c2);
1901 DECODE_BIG5 (c1, c2, charset, c3, c4);
1902 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1903 }
1904 else /* Invalid code */
1905 *dst++ = c1;
1906 }
1907 else
1908 {
1909 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1910 if (sjis_p)
1911 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1912 else
1913 {
1914 int charset;
1915
1916 ONE_MORE_BYTE (c2);
1917 DECODE_BIG5 (c1, c2, charset, c3, c4);
1918 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1919 }
1920 }
1921 continue;
1922
1923 label_end_of_loop:
1924 coding->carryover_size = src - src_base;
1925 bcopy (src_base, coding->carryover, coding->carryover_size);
1926 src = src_base;
1927 break;
1928 }
1929
1930 *consumed = src - source;
1931 return dst - destination;
1932 }
1933
1934 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1935 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1936 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1937 sure that all these charsets are registered as official charset
1938 (i.e. do not have extended leading-codes). Characters of other
1939 charsets are produced without any encoding. If SJIS_P is 1, encode
1940 SJIS text, else encode BIG5 text. */
1941
1942 int
1943 encode_coding_sjis_big5 (coding, source, destination,
1944 src_bytes, dst_bytes, consumed, sjis_p)
1945 struct coding_system *coding;
1946 unsigned char *source, *destination;
1947 int src_bytes, dst_bytes;
1948 int *consumed;
1949 int sjis_p;
1950 {
1951 unsigned char *src = source;
1952 unsigned char *src_end = source + src_bytes;
1953 unsigned char *dst = destination;
1954 unsigned char *dst_end = destination + dst_bytes;
1955 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1956 from DST_END to assure overflow checking is necessary only at the
1957 head of loop. */
1958 unsigned char *adjusted_dst_end = dst_end - 1;
1959 Lisp_Object unification_table
1960 = coding->character_unification_table_for_encode;
1961
1962 if (!NILP (Venable_character_unification) && NILP (unification_table))
1963 unification_table = Vstandard_character_unification_table_for_encode;
1964
1965 while (src < src_end && dst < adjusted_dst_end)
1966 {
1967 /* SRC_BASE remembers the start position in source in each loop.
1968 The loop will be exited when there's not enough source text
1969 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1970 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1971 before exiting. */
1972 unsigned char *src_base = src;
1973 unsigned char c1 = *src++, c2, c3, c4;
1974
1975 if (coding->composing)
1976 {
1977 if (c1 == 0xA0)
1978 {
1979 ONE_MORE_BYTE (c1);
1980 c1 &= 0x7F;
1981 }
1982 else if (c1 >= 0xA0)
1983 c1 -= 0x20;
1984 else
1985 coding->composing = 0;
1986 }
1987
1988 switch (emacs_code_class[c1])
1989 {
1990 case EMACS_ascii_code:
1991 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1992 break;
1993
1994 case EMACS_control_code:
1995 *dst++ = c1;
1996 break;
1997
1998 case EMACS_carriage_return_code:
1999 if (!coding->selective)
2000 {
2001 *dst++ = c1;
2002 break;
2003 }
2004 /* fall down to treat '\r' as '\n' ... */
2005
2006 case EMACS_linefeed_code:
2007 if (coding->eol_type == CODING_EOL_LF
2008 || coding->eol_type == CODING_EOL_UNDECIDED)
2009 *dst++ = '\n';
2010 else if (coding->eol_type == CODING_EOL_CRLF)
2011 *dst++ = '\r', *dst++ = '\n';
2012 else
2013 *dst++ = '\r';
2014 break;
2015
2016 case EMACS_leading_code_2:
2017 ONE_MORE_BYTE (c2);
2018 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2019 break;
2020
2021 case EMACS_leading_code_3:
2022 TWO_MORE_BYTES (c2, c3);
2023 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2024 break;
2025
2026 case EMACS_leading_code_4:
2027 THREE_MORE_BYTES (c2, c3, c4);
2028 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2029 break;
2030
2031 case EMACS_leading_code_composition:
2032 coding->composing = 1;
2033 break;
2034
2035 default: /* i.e. case EMACS_invalid_code: */
2036 *dst++ = c1;
2037 }
2038 continue;
2039
2040 label_end_of_loop:
2041 coding->carryover_size = src_end - src_base;
2042 bcopy (src_base, coding->carryover, coding->carryover_size);
2043 src = src_end;
2044 break;
2045 }
2046
2047 *consumed = src - source;
2048 return dst - destination;
2049 }
2050
2051 \f
2052 /*** 5. End-of-line handlers ***/
2053
2054 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2055 This function is called only when `coding->eol_type' is
2056 CODING_EOL_CRLF or CODING_EOL_CR. */
2057
2058 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2059 struct coding_system *coding;
2060 unsigned char *source, *destination;
2061 int src_bytes, dst_bytes;
2062 int *consumed;
2063 {
2064 unsigned char *src = source;
2065 unsigned char *src_end = source + src_bytes;
2066 unsigned char *dst = destination;
2067 unsigned char *dst_end = destination + dst_bytes;
2068 int produced;
2069
2070 switch (coding->eol_type)
2071 {
2072 case CODING_EOL_CRLF:
2073 {
2074 /* Since the maximum bytes produced by each loop is 2, we
2075 subtract 1 from DST_END to assure overflow checking is
2076 necessary only at the head of loop. */
2077 unsigned char *adjusted_dst_end = dst_end - 1;
2078
2079 while (src < src_end && dst < adjusted_dst_end)
2080 {
2081 unsigned char *src_base = src;
2082 unsigned char c = *src++;
2083 if (c == '\r')
2084 {
2085 ONE_MORE_BYTE (c);
2086 if (c != '\n')
2087 *dst++ = '\r';
2088 *dst++ = c;
2089 }
2090 else
2091 *dst++ = c;
2092 continue;
2093
2094 label_end_of_loop:
2095 coding->carryover_size = src - src_base;
2096 bcopy (src_base, coding->carryover, coding->carryover_size);
2097 src = src_base;
2098 break;
2099 }
2100 *consumed = src - source;
2101 produced = dst - destination;
2102 break;
2103 }
2104
2105 case CODING_EOL_CR:
2106 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2107 bcopy (source, destination, produced);
2108 dst_end = destination + produced;
2109 while (dst < dst_end)
2110 if (*dst++ == '\r') dst[-1] = '\n';
2111 *consumed = produced;
2112 break;
2113
2114 default: /* i.e. case: CODING_EOL_LF */
2115 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2116 bcopy (source, destination, produced);
2117 *consumed = produced;
2118 break;
2119 }
2120
2121 return produced;
2122 }
2123
2124 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2125 format of end-of-line according to `coding->eol_type'. If
2126 `coding->selective' is 1, code '\r' in source text also means
2127 end-of-line. */
2128
2129 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2130 struct coding_system *coding;
2131 unsigned char *source, *destination;
2132 int src_bytes, dst_bytes;
2133 int *consumed;
2134 {
2135 unsigned char *src = source;
2136 unsigned char *dst = destination;
2137 int produced;
2138
2139 if (src_bytes <= 0)
2140 return 0;
2141
2142 switch (coding->eol_type)
2143 {
2144 case CODING_EOL_LF:
2145 case CODING_EOL_UNDECIDED:
2146 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2147 bcopy (source, destination, produced);
2148 if (coding->selective)
2149 {
2150 int i = produced;
2151 while (i--)
2152 if (*dst++ == '\r') dst[-1] = '\n';
2153 }
2154 *consumed = produced;
2155
2156 case CODING_EOL_CRLF:
2157 {
2158 unsigned char c;
2159 unsigned char *src_end = source + src_bytes;
2160 unsigned char *dst_end = destination + dst_bytes;
2161 /* Since the maximum bytes produced by each loop is 2, we
2162 subtract 1 from DST_END to assure overflow checking is
2163 necessary only at the head of loop. */
2164 unsigned char *adjusted_dst_end = dst_end - 1;
2165
2166 while (src < src_end && dst < adjusted_dst_end)
2167 {
2168 c = *src++;
2169 if (c == '\n' || (c == '\r' && coding->selective))
2170 *dst++ = '\r', *dst++ = '\n';
2171 else
2172 *dst++ = c;
2173 }
2174 produced = dst - destination;
2175 *consumed = src - source;
2176 break;
2177 }
2178
2179 default: /* i.e. case CODING_EOL_CR: */
2180 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2181 bcopy (source, destination, produced);
2182 {
2183 int i = produced;
2184 while (i--)
2185 if (*dst++ == '\n') dst[-1] = '\r';
2186 }
2187 *consumed = produced;
2188 }
2189
2190 return produced;
2191 }
2192
2193 \f
2194 /*** 6. C library functions ***/
2195
2196 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2197 has a property `coding-system'. The value of this property is a
2198 vector of length 5 (called as coding-vector). Among elements of
2199 this vector, the first (element[0]) and the fifth (element[4])
2200 carry important information for decoding/encoding. Before
2201 decoding/encoding, this information should be set in fields of a
2202 structure of type `coding_system'.
2203
2204 A value of property `coding-system' can be a symbol of another
2205 subsidiary coding-system. In that case, Emacs gets coding-vector
2206 from that symbol.
2207
2208 `element[0]' contains information to be set in `coding->type'. The
2209 value and its meaning is as follows:
2210
2211 0 -- coding_type_emacs_mule
2212 1 -- coding_type_sjis
2213 2 -- coding_type_iso2022
2214 3 -- coding_type_big5
2215 4 -- coding_type_ccl encoder/decoder written in CCL
2216 nil -- coding_type_no_conversion
2217 t -- coding_type_undecided (automatic conversion on decoding,
2218 no-conversion on encoding)
2219
2220 `element[4]' contains information to be set in `coding->flags' and
2221 `coding->spec'. The meaning varies by `coding->type'.
2222
2223 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2224 of length 32 (of which the first 16 sub-elements are used now).
2225 Meanings of these sub-elements are:
2226
2227 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2228 If the value is an integer of valid charset, the charset is
2229 assumed to be designated to graphic register N initially.
2230
2231 If the value is minus, it is a minus value of charset which
2232 reserves graphic register N, which means that the charset is
2233 not designated initially but should be designated to graphic
2234 register N just before encoding a character in that charset.
2235
2236 If the value is nil, graphic register N is never used on
2237 encoding.
2238
2239 sub-element[N] where N is 4 through 15: to be set in `coding->flags'
2240 Each value takes t or nil. See the section ISO2022 of
2241 `coding.h' for more information.
2242
2243 If `coding->type' is `coding_type_big5', element[4] is t to denote
2244 BIG5-ETen or nil to denote BIG5-HKU.
2245
2246 If `coding->type' takes the other value, element[4] is ignored.
2247
2248 Emacs Lisp's coding system also carries information about format of
2249 end-of-line in a value of property `eol-type'. If the value is
2250 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2251 means CODING_EOL_CR. If it is not integer, it should be a vector
2252 of subsidiary coding systems of which property `eol-type' has one
2253 of above values.
2254
2255 */
2256
2257 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2258 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2259 is setup so that no conversion is necessary and return -1, else
2260 return 0. */
2261
2262 int
2263 setup_coding_system (coding_system, coding)
2264 Lisp_Object coding_system;
2265 struct coding_system *coding;
2266 {
2267 Lisp_Object type, eol_type;
2268
2269 /* At first, set several fields to default values. */
2270 coding->require_flushing = 0;
2271 coding->last_block = 0;
2272 coding->selective = 0;
2273 coding->composing = 0;
2274 coding->direction = 0;
2275 coding->carryover_size = 0;
2276 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2277 coding->character_unification_table_for_decode = Qnil;
2278 coding->character_unification_table_for_encode = Qnil;
2279
2280 Vlast_coding_system_used = coding->symbol = coding_system;
2281 eol_type = Qnil;
2282 /* Get value of property `coding-system' until we get a vector.
2283 While doing that, also get values of properties
2284 `post-read-conversion', `pre-write-conversion',
2285 `character-unification-table-for-decode',
2286 `character-unification-table-for-encode' and `eol-type'. */
2287 while (!NILP (coding_system) && SYMBOLP (coding_system))
2288 {
2289 if (NILP (coding->post_read_conversion))
2290 coding->post_read_conversion = Fget (coding_system,
2291 Qpost_read_conversion);
2292 if (NILP (coding->pre_write_conversion))
2293 coding->pre_write_conversion = Fget (coding_system,
2294 Qpre_write_conversion);
2295 if (!inhibit_eol_conversion && NILP (eol_type))
2296 eol_type = Fget (coding_system, Qeol_type);
2297
2298 if (NILP (coding->character_unification_table_for_decode))
2299 coding->character_unification_table_for_decode
2300 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2301
2302 if (NILP (coding->character_unification_table_for_encode))
2303 coding->character_unification_table_for_encode
2304 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2305
2306 coding_system = Fget (coding_system, Qcoding_system);
2307 }
2308
2309 while (!NILP (coding->character_unification_table_for_decode)
2310 && SYMBOLP (coding->character_unification_table_for_decode))
2311 coding->character_unification_table_for_decode
2312 = Fget (coding->character_unification_table_for_decode,
2313 Qcharacter_unification_table_for_decode);
2314 if (!NILP (coding->character_unification_table_for_decode)
2315 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2316 coding->character_unification_table_for_decode = Qnil;
2317
2318 while (!NILP (coding->character_unification_table_for_encode)
2319 && SYMBOLP (coding->character_unification_table_for_encode))
2320 coding->character_unification_table_for_encode
2321 = Fget (coding->character_unification_table_for_encode,
2322 Qcharacter_unification_table_for_encode);
2323 if (!NILP (coding->character_unification_table_for_encode)
2324 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2325 coding->character_unification_table_for_encode = Qnil;
2326
2327 if (!VECTORP (coding_system)
2328 || XVECTOR (coding_system)->size != 5)
2329 goto label_invalid_coding_system;
2330
2331 if (VECTORP (eol_type))
2332 coding->eol_type = CODING_EOL_UNDECIDED;
2333 else if (XFASTINT (eol_type) == 1)
2334 coding->eol_type = CODING_EOL_CRLF;
2335 else if (XFASTINT (eol_type) == 2)
2336 coding->eol_type = CODING_EOL_CR;
2337 else
2338 coding->eol_type = CODING_EOL_LF;
2339
2340 type = XVECTOR (coding_system)->contents[0];
2341 switch (XFASTINT (type))
2342 {
2343 case 0:
2344 coding->type = coding_type_emacs_mule;
2345 break;
2346
2347 case 1:
2348 coding->type = coding_type_sjis;
2349 break;
2350
2351 case 2:
2352 coding->type = coding_type_iso2022;
2353 {
2354 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2355 Lisp_Object *flags;
2356 int i, charset, default_reg_bits = 0;
2357
2358 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2359 goto label_invalid_coding_system;
2360
2361 flags = XVECTOR (val)->contents;
2362 coding->flags
2363 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2364 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2365 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2366 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2367 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2368 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2369 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2370 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2371 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2372 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2373 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2374 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2375 );
2376
2377 /* Invoke graphic register 0 to plane 0. */
2378 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2379 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2380 CODING_SPEC_ISO_INVOCATION (coding, 1)
2381 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2382 /* Not single shifting at first. */
2383 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2384 /* Beginning of buffer should also be regarded as bol. */
2385 CODING_SPEC_ISO_BOL(coding) = 1;
2386
2387 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2388 FLAGS[REG] can be one of below:
2389 integer CHARSET: CHARSET occupies register I,
2390 t: designate nothing to REG initially, but can be used
2391 by any charsets,
2392 list of integer, nil, or t: designate the first
2393 element (if integer) to REG initially, the remaining
2394 elements (if integer) is designated to REG on request,
2395 if an element is t, REG can be used by any charset,
2396 nil: REG is never used. */
2397 for (charset = 0; charset <= MAX_CHARSET; charset++)
2398 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2399 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2400 for (i = 0; i < 4; i++)
2401 {
2402 if (INTEGERP (flags[i])
2403 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2404 || (charset = get_charset_id (flags[i])) >= 0)
2405 {
2406 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2407 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2408 }
2409 else if (EQ (flags[i], Qt))
2410 {
2411 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2412 default_reg_bits |= 1 << i;
2413 }
2414 else if (CONSP (flags[i]))
2415 {
2416 Lisp_Object tail = flags[i];
2417
2418 if (INTEGERP (XCONS (tail)->car)
2419 && (charset = XINT (XCONS (tail)->car),
2420 CHARSET_VALID_P (charset))
2421 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2422 {
2423 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2424 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2425 }
2426 else
2427 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2428 tail = XCONS (tail)->cdr;
2429 while (CONSP (tail))
2430 {
2431 if (INTEGERP (XCONS (tail)->car)
2432 && (charset = XINT (XCONS (tail)->car),
2433 CHARSET_VALID_P (charset))
2434 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2435 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2436 = i;
2437 else if (EQ (XCONS (tail)->car, Qt))
2438 default_reg_bits |= 1 << i;
2439 tail = XCONS (tail)->cdr;
2440 }
2441 }
2442 else
2443 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2444
2445 CODING_SPEC_ISO_DESIGNATION (coding, i)
2446 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2447 }
2448
2449 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2450 {
2451 /* REG 1 can be used only by locking shift in 7-bit env. */
2452 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2453 default_reg_bits &= ~2;
2454 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2455 /* Without any shifting, only REG 0 and 1 can be used. */
2456 default_reg_bits &= 3;
2457 }
2458
2459 if (! (coding->flags & CODING_FLAG_ISO_SAFE))
2460 for (charset = 0; charset <= MAX_CHARSET; charset++)
2461 if (CHARSET_VALID_P (charset)
2462 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2463 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2464 {
2465 /* We have not yet decided where to designate CHARSET. */
2466 int reg_bits = default_reg_bits;
2467
2468 if (CHARSET_CHARS (charset) == 96)
2469 /* A charset of CHARS96 can't be designated to REG 0. */
2470 reg_bits &= ~1;
2471
2472 if (reg_bits)
2473 /* There exist some default graphic register. */
2474 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2475 = (reg_bits & 1
2476 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2477 else
2478 /* We anyway have to designate CHARSET to somewhere. */
2479 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2480 = (CHARSET_CHARS (charset) == 94
2481 ? 0
2482 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2483 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2484 ? 1
2485 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2486 ? 2 : 0)));
2487 }
2488 }
2489 coding->require_flushing = 1;
2490 break;
2491
2492 case 3:
2493 coding->type = coding_type_big5;
2494 coding->flags
2495 = (NILP (XVECTOR (coding_system)->contents[4])
2496 ? CODING_FLAG_BIG5_HKU
2497 : CODING_FLAG_BIG5_ETEN);
2498 break;
2499
2500 case 4:
2501 coding->type = coding_type_ccl;
2502 {
2503 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2504 if (CONSP (val)
2505 && VECTORP (XCONS (val)->car)
2506 && VECTORP (XCONS (val)->cdr))
2507 {
2508 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2509 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2510 }
2511 else
2512 goto label_invalid_coding_system;
2513 }
2514 coding->require_flushing = 1;
2515 break;
2516
2517 default:
2518 if (EQ (type, Qt))
2519 coding->type = coding_type_undecided;
2520 else
2521 coding->type = coding_type_no_conversion;
2522 break;
2523 }
2524 return 0;
2525
2526 label_invalid_coding_system:
2527 coding->type = coding_type_no_conversion;
2528 coding->eol_type = CODING_EOL_LF;
2529 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2530 = Qnil;
2531 return -1;
2532 }
2533
2534 /* Emacs has a mechanism to automatically detect a coding system if it
2535 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2536 it's impossible to distinguish some coding systems accurately
2537 because they use the same range of codes. So, at first, coding
2538 systems are categorized into 9, those are:
2539
2540 o coding-category-emacs-mule
2541
2542 The category for a coding system which has the same code range
2543 as Emacs' internal format. Assigned the coding-system (Lisp
2544 symbol) `emacs-mule' by default.
2545
2546 o coding-category-sjis
2547
2548 The category for a coding system which has the same code range
2549 as SJIS. Assigned the coding-system (Lisp
2550 symbol) `japanese-shift-jis' by default.
2551
2552 o coding-category-iso-7
2553
2554 The category for a coding system which has the same code range
2555 as ISO2022 of 7-bit environment. This doesn't use any locking
2556 shift and single shift functions. Assigned the coding-system
2557 (Lisp symbol) `iso-2022-7bit' by default.
2558
2559 o coding-category-iso-8-1
2560
2561 The category for a coding system which has the same code range
2562 as ISO2022 of 8-bit environment and graphic plane 1 used only
2563 for DIMENSION1 charset. This doesn't use any locking shift
2564 and single shift functions. Assigned the coding-system (Lisp
2565 symbol) `iso-latin-1' by default.
2566
2567 o coding-category-iso-8-2
2568
2569 The category for a coding system which has the same code range
2570 as ISO2022 of 8-bit environment and graphic plane 1 used only
2571 for DIMENSION2 charset. This doesn't use any locking shift
2572 and single shift functions. Assigned the coding-system (Lisp
2573 symbol) `japanese-iso-8bit' by default.
2574
2575 o coding-category-iso-7-else
2576
2577 The category for a coding system which has the same code range
2578 as ISO2022 of 7-bit environment but uses locking shift or
2579 single shift functions. Assigned the coding-system (Lisp
2580 symbol) `iso-2022-7bit-lock' by default.
2581
2582 o coding-category-iso-8-else
2583
2584 The category for a coding system which has the same code range
2585 as ISO2022 of 8-bit environment but uses locking shift or
2586 single shift functions. Assigned the coding-system (Lisp
2587 symbol) `iso-2022-8bit-ss2' by default.
2588
2589 o coding-category-big5
2590
2591 The category for a coding system which has the same code range
2592 as BIG5. Assigned the coding-system (Lisp symbol)
2593 `cn-big5' by default.
2594
2595 o coding-category-binary
2596
2597 The category for a coding system not categorized in any of the
2598 above. Assigned the coding-system (Lisp symbol)
2599 `no-conversion' by default.
2600
2601 Each of them is a Lisp symbol and the value is an actual
2602 `coding-system's (this is also a Lisp symbol) assigned by a user.
2603 What Emacs does actually is to detect a category of coding system.
2604 Then, it uses a `coding-system' assigned to it. If Emacs can't
2605 decide only one possible category, it selects a category of the
2606 highest priority. Priorities of categories are also specified by a
2607 user in a Lisp variable `coding-category-list'.
2608
2609 */
2610
2611 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2612 If it detects possible coding systems, return an integer in which
2613 appropriate flag bits are set. Flag bits are defined by macros
2614 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2615
2616 int
2617 detect_coding_mask (src, src_bytes)
2618 unsigned char *src;
2619 int src_bytes;
2620 {
2621 register unsigned char c;
2622 unsigned char *src_end = src + src_bytes;
2623 int mask;
2624
2625 /* At first, skip all ASCII characters and control characters except
2626 for three ISO2022 specific control characters. */
2627 label_loop_detect_coding:
2628 while (src < src_end)
2629 {
2630 c = *src;
2631 if (c >= 0x80
2632 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2633 break;
2634 src++;
2635 }
2636
2637 if (src >= src_end)
2638 /* We found nothing other than ASCII. There's nothing to do. */
2639 return CODING_CATEGORY_MASK_ANY;
2640
2641 /* The text seems to be encoded in some multilingual coding system.
2642 Now, try to find in which coding system the text is encoded. */
2643 if (c < 0x80)
2644 {
2645 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2646 /* C is an ISO2022 specific control code of C0. */
2647 mask = detect_coding_iso2022 (src, src_end);
2648 src++;
2649 if (mask == CODING_CATEGORY_MASK_ANY)
2650 /* No valid ISO2022 code follows C. Try again. */
2651 goto label_loop_detect_coding;
2652 }
2653 else if (c < 0xA0)
2654 {
2655 /* If C is a special Microsoft code,
2656 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2657 or is an ISO2022 control-sequence-introducer (CSI),
2658 we should also consider the possibility of someof ISO2022 codings. */
2659 if ((VECTORP (Vmicrosoft_code_table)
2660 && !NILP (XVECTOR (Vmicrosoft_code_table)->contents[c]))
2661 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2662 || (c == ISO_CODE_CSI
2663 && (src < src_end
2664 && (*src == ']'
2665 || (src + 1 < src_end
2666 && src[1] == ']'
2667 && (*src == '0' || *src == '1' || *src == '2'))))))
2668 mask = (detect_coding_iso2022 (src, src_end)
2669 | detect_coding_sjis (src, src_end)
2670 | detect_coding_emacs_mule (src, src_end)
2671 | CODING_CATEGORY_MASK_BINARY);
2672
2673 else
2674 /* C is the first byte of SJIS character code, or a
2675 leading-code of Emacs. */
2676 mask = (detect_coding_sjis (src, src_end)
2677 | detect_coding_emacs_mule (src, src_end)
2678 | CODING_CATEGORY_MASK_BINARY);
2679 }
2680 else
2681 /* C is a character of ISO2022 in graphic plane right,
2682 or a SJIS's 1-byte character code (i.e. JISX0201),
2683 or the first byte of BIG5's 2-byte code. */
2684 mask = (detect_coding_iso2022 (src, src_end)
2685 | detect_coding_sjis (src, src_end)
2686 | detect_coding_big5 (src, src_end)
2687 | CODING_CATEGORY_MASK_BINARY);
2688
2689 return mask;
2690 }
2691
2692 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2693 The information of the detected coding system is set in CODING. */
2694
2695 void
2696 detect_coding (coding, src, src_bytes)
2697 struct coding_system *coding;
2698 unsigned char *src;
2699 int src_bytes;
2700 {
2701 int mask = detect_coding_mask (src, src_bytes);
2702 int idx;
2703
2704 if (mask == CODING_CATEGORY_MASK_ANY)
2705 /* We found nothing other than ASCII. There's nothing to do. */
2706 return;
2707
2708 if (!mask)
2709 /* The source text seems to be encoded in unknown coding system.
2710 Emacs regards the category of such a kind of coding system as
2711 `coding-category-binary'. We assume that a user has assigned
2712 an appropriate coding system for a `coding-category-binary'. */
2713 idx = CODING_CATEGORY_IDX_BINARY;
2714 else
2715 {
2716 /* We found some plausible coding systems. Let's use a coding
2717 system of the highest priority. */
2718 Lisp_Object val = Vcoding_category_list;
2719
2720 if (CONSP (val))
2721 while (!NILP (val))
2722 {
2723 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2724 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2725 break;
2726 val = XCONS (val)->cdr;
2727 }
2728 else
2729 val = Qnil;
2730
2731 if (NILP (val))
2732 {
2733 /* For unknown reason, `Vcoding_category_list' contains none
2734 of found categories. Let's use any of them. */
2735 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2736 if (mask & (1 << idx))
2737 break;
2738 }
2739 }
2740 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2741 }
2742
2743 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2744 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2745 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2746
2747 #define MAX_EOL_CHECK_COUNT 3
2748
2749 int
2750 detect_eol_type (src, src_bytes)
2751 unsigned char *src;
2752 int src_bytes;
2753 {
2754 unsigned char *src_end = src + src_bytes;
2755 unsigned char c;
2756 int total = 0; /* How many end-of-lines are found so far. */
2757 int eol_type = CODING_EOL_UNDECIDED;
2758 int this_eol_type;
2759
2760 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2761 {
2762 c = *src++;
2763 if (c == '\n' || c == '\r')
2764 {
2765 total++;
2766 if (c == '\n')
2767 this_eol_type = CODING_EOL_LF;
2768 else if (src >= src_end || *src != '\n')
2769 this_eol_type = CODING_EOL_CR;
2770 else
2771 this_eol_type = CODING_EOL_CRLF, src++;
2772
2773 if (eol_type == CODING_EOL_UNDECIDED)
2774 /* This is the first end-of-line. */
2775 eol_type = this_eol_type;
2776 else if (eol_type != this_eol_type)
2777 /* The found type is different from what found before.
2778 We had better not decode end-of-line. */
2779 return CODING_EOL_LF;
2780 }
2781 }
2782
2783 return eol_type;
2784 }
2785
2786 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2787 is encoded. If it detects an appropriate format of end-of-line, it
2788 sets the information in *CODING. */
2789
2790 void
2791 detect_eol (coding, src, src_bytes)
2792 struct coding_system *coding;
2793 unsigned char *src;
2794 int src_bytes;
2795 {
2796 Lisp_Object val;
2797 int eol_type = detect_eol_type (src, src_bytes);
2798
2799 if (eol_type == CODING_EOL_UNDECIDED)
2800 /* We found no end-of-line in the source text. */
2801 return;
2802
2803 val = Fget (coding->symbol, Qeol_type);
2804 if (VECTORP (val) && XVECTOR (val)->size == 3)
2805 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2806 }
2807
2808 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2809 decoding, it may detect coding system and format of end-of-line if
2810 those are not yet decided. */
2811
2812 int
2813 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2814 struct coding_system *coding;
2815 unsigned char *source, *destination;
2816 int src_bytes, dst_bytes;
2817 int *consumed;
2818 {
2819 int produced;
2820
2821 if (src_bytes <= 0)
2822 {
2823 *consumed = 0;
2824 return 0;
2825 }
2826
2827 if (coding->type == coding_type_undecided)
2828 detect_coding (coding, source, src_bytes);
2829
2830 if (coding->eol_type == CODING_EOL_UNDECIDED)
2831 detect_eol (coding, source, src_bytes);
2832
2833 coding->carryover_size = 0;
2834 switch (coding->type)
2835 {
2836 case coding_type_no_conversion:
2837 label_no_conversion:
2838 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2839 bcopy (source, destination, produced);
2840 *consumed = produced;
2841 break;
2842
2843 case coding_type_emacs_mule:
2844 case coding_type_undecided:
2845 if (coding->eol_type == CODING_EOL_LF
2846 || coding->eol_type == CODING_EOL_UNDECIDED)
2847 goto label_no_conversion;
2848 produced = decode_eol (coding, source, destination,
2849 src_bytes, dst_bytes, consumed);
2850 break;
2851
2852 case coding_type_sjis:
2853 produced = decode_coding_sjis_big5 (coding, source, destination,
2854 src_bytes, dst_bytes, consumed,
2855 1);
2856 break;
2857
2858 case coding_type_iso2022:
2859 produced = decode_coding_iso2022 (coding, source, destination,
2860 src_bytes, dst_bytes, consumed);
2861 break;
2862
2863 case coding_type_big5:
2864 produced = decode_coding_sjis_big5 (coding, source, destination,
2865 src_bytes, dst_bytes, consumed,
2866 0);
2867 break;
2868
2869 case coding_type_ccl:
2870 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2871 src_bytes, dst_bytes, consumed);
2872 break;
2873 }
2874
2875 return produced;
2876 }
2877
2878 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2879
2880 int
2881 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2882 struct coding_system *coding;
2883 unsigned char *source, *destination;
2884 int src_bytes, dst_bytes;
2885 int *consumed;
2886 {
2887 int produced;
2888
2889 switch (coding->type)
2890 {
2891 case coding_type_no_conversion:
2892 label_no_conversion:
2893 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2894 if (produced > 0)
2895 {
2896 bcopy (source, destination, produced);
2897 if (coding->selective)
2898 {
2899 unsigned char *p = destination, *pend = destination + produced;
2900 while (p < pend)
2901 if (*p++ == '\015') p[-1] = '\n';
2902 }
2903 }
2904 *consumed = produced;
2905 break;
2906
2907 case coding_type_emacs_mule:
2908 case coding_type_undecided:
2909 if (coding->eol_type == CODING_EOL_LF
2910 || coding->eol_type == CODING_EOL_UNDECIDED)
2911 goto label_no_conversion;
2912 produced = encode_eol (coding, source, destination,
2913 src_bytes, dst_bytes, consumed);
2914 break;
2915
2916 case coding_type_sjis:
2917 produced = encode_coding_sjis_big5 (coding, source, destination,
2918 src_bytes, dst_bytes, consumed,
2919 1);
2920 break;
2921
2922 case coding_type_iso2022:
2923 produced = encode_coding_iso2022 (coding, source, destination,
2924 src_bytes, dst_bytes, consumed);
2925 break;
2926
2927 case coding_type_big5:
2928 produced = encode_coding_sjis_big5 (coding, source, destination,
2929 src_bytes, dst_bytes, consumed,
2930 0);
2931 break;
2932
2933 case coding_type_ccl:
2934 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2935 src_bytes, dst_bytes, consumed);
2936 break;
2937 }
2938
2939 return produced;
2940 }
2941
2942 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2943
2944 /* Return maximum size (bytes) of a buffer enough for decoding
2945 SRC_BYTES of text encoded in CODING. */
2946
2947 int
2948 decoding_buffer_size (coding, src_bytes)
2949 struct coding_system *coding;
2950 int src_bytes;
2951 {
2952 int magnification;
2953
2954 if (coding->type == coding_type_iso2022)
2955 magnification = 3;
2956 else if (coding->type == coding_type_ccl)
2957 magnification = coding->spec.ccl.decoder.buf_magnification;
2958 else
2959 magnification = 2;
2960
2961 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2962 }
2963
2964 /* Return maximum size (bytes) of a buffer enough for encoding
2965 SRC_BYTES of text to CODING. */
2966
2967 int
2968 encoding_buffer_size (coding, src_bytes)
2969 struct coding_system *coding;
2970 int src_bytes;
2971 {
2972 int magnification;
2973
2974 if (coding->type == coding_type_ccl)
2975 magnification = coding->spec.ccl.encoder.buf_magnification;
2976 else
2977 magnification = 3;
2978
2979 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2980 }
2981
/* Initial size of the shared conversion buffer; also the size the
   buffer grows from if it has not been set up yet.  */
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer used for encoding and decoding, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is shared: the same memory is
   handed out on every call, and growing it discards the old contents.

   When the current buffer is too small, its size is doubled until it
   is large enough.  The new buffer is allocated before the old one is
   released, so the old buffer stays in place until xmalloc has
   returned.  The return value is whatever conversion_buffer holds
   afterwards.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size && size > 0)
    {
      char *buf;
      /* Do the doubling in unsigned arithmetic so that it can not
	 overflow: every value doubled below is smaller than WANTED,
	 which itself fits in an int.  */
      unsigned int wanted = (unsigned int) size;
      unsigned int max_int = (unsigned int) -1 / 2;
      unsigned int real_size;

      /* Start from twice the current size.  If the buffer has not
	 been set up yet, start from the minimum size instead;
	 doubling zero would never reach WANTED.  */
      if (conversion_buffer_size > 0)
	real_size = (unsigned int) conversion_buffer_size * 2;
      else
	real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < wanted)
	real_size *= 2;

      /* The size is stored in an int; if the doubled size does not
	 fit, settle for exactly what was asked for.  */
      if (real_size > max_int)
	real_size = wanted;

      buf = (char *) xmalloc ((int) real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = (int) real_size;
    }
  return conversion_buffer;
}
3010
3011 \f
3012 #ifdef emacs
3013 /*** 7. Emacs Lisp library functions ***/
3014
3015 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3016 1, 1, 0,
3017 "Return coding-spec of CODING-SYSTEM.\n\
3018 If CODING-SYSTEM is not a valid coding-system, return nil.")
3019 (obj)
3020 Lisp_Object obj;
3021 {
3022 while (SYMBOLP (obj) && !NILP (obj))
3023 obj = Fget (obj, Qcoding_system);
3024 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3025 ? Qnil : obj);
3026 }
3027
3028 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3029 "Return t if OBJECT is nil or a coding-system.\n\
3030 See document of make-coding-system for coding-system object.")
3031 (obj)
3032 Lisp_Object obj;
3033 {
3034 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3035 }
3036
3037 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3038 Sread_non_nil_coding_system, 1, 1, 0,
3039 "Read a coding system from the minibuffer, prompting with string PROMPT.")
3040 (prompt)
3041 Lisp_Object prompt;
3042 {
3043 Lisp_Object val;
3044 do
3045 {
3046 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3047 Qt, Qnil, Qnil, Qnil);
3048 }
3049 while (XSTRING (val)->size == 0);
3050 return (Fintern (val, Qnil));
3051 }
3052
3053 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3054 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3055 (prompt)
3056 Lisp_Object prompt;
3057 {
3058 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3059 Qt, Qnil, Qnil, Qnil);
3060 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3061 }
3062
3063 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3064 1, 1, 0,
3065 "Check validity of CODING-SYSTEM.\n\
3066 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3067 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3068 The value of property should be a vector of length 5.")
3069 (coding_system)
3070 Lisp_Object coding_system;
3071 {
3072 CHECK_SYMBOL (coding_system, 0);
3073 if (!NILP (Fcoding_system_p (coding_system)))
3074 return coding_system;
3075 while (1)
3076 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3077 }
3078
3079 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3080 2, 2, 0,
3081 "Detect coding system of the text in the region between START and END.\n\
3082 Return a list of possible coding systems ordered by priority.\n\
3083 If only ASCII characters are found, it returns `undecided'\n\
3084 or its subsidiary coding system according to a detected end-of-line format.")
3085 (b, e)
3086 Lisp_Object b, e;
3087 {
3088 int coding_mask, eol_type;
3089 Lisp_Object val;
3090 int beg, end;
3091
3092 validate_region (&b, &e);
3093 beg = XINT (b), end = XINT (e);
3094 if (beg < GPT && end >= GPT) move_gap (end);
3095
3096 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3097 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3098
3099 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3100 {
3101 val = intern ("undecided");
3102 if (eol_type != CODING_EOL_UNDECIDED)
3103 {
3104 Lisp_Object val2 = Fget (val, Qeol_type);
3105 if (VECTORP (val2))
3106 val = XVECTOR (val2)->contents[eol_type];
3107 }
3108 }
3109 else
3110 {
3111 Lisp_Object val2;
3112
3113 /* At first, gather possible coding-systems in VAL in a reverse
3114 order. */
3115 val = Qnil;
3116 for (val2 = Vcoding_category_list;
3117 !NILP (val2);
3118 val2 = XCONS (val2)->cdr)
3119 {
3120 int idx
3121 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3122 if (coding_mask & (1 << idx))
3123 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3124 }
3125
3126 /* Then, change the order of the list, while getting subsidiary
3127 coding-systems. */
3128 val2 = val;
3129 val = Qnil;
3130 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3131 {
3132 if (eol_type == CODING_EOL_UNDECIDED)
3133 val = Fcons (XCONS (val2)->car, val);
3134 else
3135 {
3136 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3137 if (VECTORP (val3))
3138 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3139 else
3140 val = Fcons (XCONS (val2)->car, val);
3141 }
3142 }
3143 }
3144
3145 return val;
3146 }
3147
3148 /* Scan text in the region between *BEGP and *ENDP, skip characters
3149 which we never have to encode to (iff ENCODEP is 1) or decode from
3150 coding system CODING at the head and tail, then set BEGP and ENDP
3151 to the addresses of start and end of the text we actually convert. */
3152
3153 void
3154 shrink_conversion_area (begp, endp, coding, encodep)
3155 unsigned char **begp, **endp;
3156 struct coding_system *coding;
3157 int encodep;
3158 {
3159 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3160
3161 if (coding->eol_type != CODING_EOL_LF
3162 && coding->eol_type != CODING_EOL_UNDECIDED)
3163 /* Since we anyway have to convert end-of-line format, it is not
3164 worth skipping at most 100 bytes or so. */
3165 return;
3166
3167 if (encodep) /* for encoding */
3168 {
3169 switch (coding->type)
3170 {
3171 case coding_type_no_conversion:
3172 case coding_type_emacs_mule:
3173 case coding_type_undecided:
3174 /* We need no conversion. */
3175 *begp = *endp;
3176 return;
3177 case coding_type_ccl:
3178 /* We can't skip any data. */
3179 return;
3180 case coding_type_iso2022:
3181 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3182 {
3183 unsigned char *bol = beg_addr;
3184 while (beg_addr < end_addr && *beg_addr < 0x80)
3185 {
3186 beg_addr++;
3187 if (*(beg_addr - 1) == '\n')
3188 bol = beg_addr;
3189 }
3190 beg_addr = bol;
3191 goto label_skip_tail;
3192 }
3193 /* fall down ... */
3194 default:
3195 /* We can skip all ASCII characters at the head and tail. */
3196 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3197 label_skip_tail:
3198 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3199 break;
3200 }
3201 }
3202 else /* for decoding */
3203 {
3204 switch (coding->type)
3205 {
3206 case coding_type_no_conversion:
3207 /* We need no conversion. */
3208 *begp = *endp;
3209 return;
3210 case coding_type_emacs_mule:
3211 if (coding->eol_type == CODING_EOL_LF)
3212 {
3213 /* We need no conversion. */
3214 *begp = *endp;
3215 return;
3216 }
3217 /* We can skip all but carriage-return. */
3218 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3219 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3220 break;
3221 case coding_type_sjis:
3222 case coding_type_big5:
3223 /* We can skip all ASCII characters at the head. */
3224 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3225 /* We can skip all ASCII characters at the tail except for
3226 the second byte of SJIS or BIG5 code. */
3227 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3228 if (end_addr != *endp)
3229 end_addr++;
3230 break;
3231 case coding_type_ccl:
3232 /* We can't skip any data. */
3233 return;
3234 default: /* i.e. case coding_type_iso2022: */
3235 {
3236 unsigned char c;
3237
3238 /* We can skip all ASCII characters except for a few
3239 control codes at the head. */
3240 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3241 && c != ISO_CODE_CR && c != ISO_CODE_SO
3242 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3243 beg_addr++;
3244 }
3245 break;
3246 }
3247 }
3248 *begp = beg_addr;
3249 *endp = end_addr;
3250 return;
3251 }
3252
3253 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3254 text between B and E. B and E are buffer position. */
3255
3256 Lisp_Object
3257 code_convert_region (b, e, coding, encodep)
3258 Lisp_Object b, e;
3259 struct coding_system *coding;
3260 int encodep;
3261 {
3262 int beg, end, len, consumed, produced;
3263 char *buf;
3264 unsigned char *begp, *endp;
3265 int pos = PT;
3266
3267 validate_region (&b, &e);
3268 beg = XINT (b), end = XINT (e);
3269 if (beg < GPT && end >= GPT)
3270 move_gap (end);
3271
3272 if (encodep && !NILP (coding->pre_write_conversion))
3273 {
3274 /* We must call a pre-conversion function which may put a new
3275 text to be converted in a new buffer. */
3276 struct buffer *old = current_buffer, *new;
3277
3278 TEMP_SET_PT (beg);
3279 call2 (coding->pre_write_conversion, b, e);
3280 if (old != current_buffer)
3281 {
3282 /* Replace the original text by the text just generated. */
3283 len = ZV - BEGV;
3284 new = current_buffer;
3285 set_buffer_internal (old);
3286 del_range (beg, end);
3287 insert_from_buffer (new, 1, len, 0);
3288 end = beg + len;
3289 }
3290 }
3291
3292 /* We may be able to shrink the conversion region. */
3293 begp = POS_ADDR (beg); endp = begp + (end - beg);
3294 shrink_conversion_area (&begp, &endp, coding, encodep);
3295
3296 if (begp == endp)
3297 /* We need no conversion. */
3298 len = end - beg;
3299 else
3300 {
3301 beg += begp - POS_ADDR (beg);
3302 end = beg + (endp - begp);
3303
3304 if (encodep)
3305 len = encoding_buffer_size (coding, end - beg);
3306 else
3307 len = decoding_buffer_size (coding, end - beg);
3308 buf = get_conversion_buffer (len);
3309
3310 coding->last_block = 1;
3311 produced = (encodep
3312 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3313 &consumed)
3314 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3315 &consumed));
3316
3317 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3318
3319 TEMP_SET_PT (beg);
3320 insert (buf, produced);
3321 del_range (PT, PT + end - beg);
3322 if (pos >= end)
3323 pos = PT + (pos - end);
3324 else if (pos > beg)
3325 pos = beg;
3326 TEMP_SET_PT (pos);
3327 }
3328
3329 if (!encodep && !NILP (coding->post_read_conversion))
3330 {
3331 /* We must call a post-conversion function which may alter
3332 the text just converted. */
3333 Lisp_Object insval;
3334
3335 beg = XINT (b);
3336 TEMP_SET_PT (beg);
3337 insval = call1 (coding->post_read_conversion, make_number (len));
3338 CHECK_NUMBER (insval, 0);
3339 len = XINT (insval);
3340 }
3341
3342 return make_number (len);
3343 }
3344
3345 Lisp_Object
3346 code_convert_string (str, coding, encodep, nocopy)
3347 Lisp_Object str, nocopy;
3348 struct coding_system *coding;
3349 int encodep;
3350 {
3351 int len, consumed, produced;
3352 char *buf;
3353 unsigned char *begp, *endp;
3354 int head_skip, tail_skip;
3355 struct gcpro gcpro1;
3356
3357 if (encodep && !NILP (coding->pre_write_conversion)
3358 || !encodep && !NILP (coding->post_read_conversion))
3359 {
3360 /* Since we have to call Lisp functions which assume target text
3361 is in a buffer, after setting a temporary buffer, call
3362 code_convert_region. */
3363 int count = specpdl_ptr - specpdl;
3364 int len = XSTRING (str)->size;
3365 Lisp_Object result;
3366 struct buffer *old = current_buffer;
3367
3368 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3369 temp_output_buffer_setup (" *code-converting-work*");
3370 set_buffer_internal (XBUFFER (Vstandard_output));
3371 insert_from_string (str, 0, len, 0);
3372 code_convert_region (make_number (BEGV), make_number (ZV),
3373 coding, encodep);
3374 result = make_buffer_string (BEGV, ZV, 0);
3375 set_buffer_internal (old);
3376 return unbind_to (count, result);
3377 }
3378
3379 /* We may be able to shrink the conversion region. */
3380 begp = XSTRING (str)->data;
3381 endp = begp + XSTRING (str)->size;
3382 shrink_conversion_area (&begp, &endp, coding, encodep);
3383
3384 if (begp == endp)
3385 /* We need no conversion. */
3386 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3387
3388 head_skip = begp - XSTRING (str)->data;
3389 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3390
3391 GCPRO1 (str);
3392
3393 if (encodep)
3394 len = encoding_buffer_size (coding, endp - begp);
3395 else
3396 len = decoding_buffer_size (coding, endp - begp);
3397 buf = get_conversion_buffer (len + head_skip + tail_skip);
3398
3399 bcopy (XSTRING (str)->data, buf, head_skip);
3400 coding->last_block = 1;
3401 produced = (encodep
3402 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3403 buf + head_skip, endp - begp, len, &consumed)
3404 : decode_coding (coding, XSTRING (str)->data + head_skip,
3405 buf + head_skip, endp - begp, len, &consumed));
3406 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3407 buf + head_skip + produced,
3408 tail_skip);
3409
3410 UNGCPRO;
3411
3412 return make_string (buf, head_skip + produced + tail_skip);
3413 }
3414
3415 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3416 3, 3, "r\nzCoding system: ",
3417 "Decode current region by specified coding system.\n\
3418 When called from a program, takes three arguments:\n\
3419 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3420 Return length of decoded text.")
3421 (b, e, coding_system)
3422 Lisp_Object b, e, coding_system;
3423 {
3424 struct coding_system coding;
3425
3426 CHECK_NUMBER_COERCE_MARKER (b, 0);
3427 CHECK_NUMBER_COERCE_MARKER (e, 1);
3428 CHECK_SYMBOL (coding_system, 2);
3429
3430 if (NILP (coding_system))
3431 return make_number (XFASTINT (e) - XFASTINT (b));
3432 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3433 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3434
3435 return code_convert_region (b, e, &coding, 0);
3436 }
3437
3438 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3439 3, 3, "r\nzCoding system: ",
3440 "Encode current region by specified coding system.\n\
3441 When called from a program, takes three arguments:\n\
3442 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3443 Return length of encoded text.")
3444 (b, e, coding_system)
3445 Lisp_Object b, e, coding_system;
3446 {
3447 struct coding_system coding;
3448
3449 CHECK_NUMBER_COERCE_MARKER (b, 0);
3450 CHECK_NUMBER_COERCE_MARKER (e, 1);
3451 CHECK_SYMBOL (coding_system, 2);
3452
3453 if (NILP (coding_system))
3454 return make_number (XFASTINT (e) - XFASTINT (b));
3455 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3456 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3457
3458 return code_convert_region (b, e, &coding, 1);
3459 }
3460
3461 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3462 2, 3, 0,
3463 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3464 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3465 of decoding.")
3466 (string, coding_system, nocopy)
3467 Lisp_Object string, coding_system, nocopy;
3468 {
3469 struct coding_system coding;
3470
3471 CHECK_STRING (string, 0);
3472 CHECK_SYMBOL (coding_system, 1);
3473
3474 if (NILP (coding_system))
3475 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3476 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3477 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3478
3479 return code_convert_string (string, &coding, 0, nocopy);
3480 }
3481
3482 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3483 2, 3, 0,
3484 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3485 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3486 of encoding.")
3487 (string, coding_system, nocopy)
3488 Lisp_Object string, coding_system, nocopy;
3489 {
3490 struct coding_system coding;
3491
3492 CHECK_STRING (string, 0);
3493 CHECK_SYMBOL (coding_system, 1);
3494
3495 if (NILP (coding_system))
3496 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3497 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3498 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3499
3500 return code_convert_string (string, &coding, 1, nocopy);
3501 }
3502
3503 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3504 "Decode a JISX0208 character of shift-jis encoding.\n\
3505 CODE is the character code in SJIS.\n\
3506 Return the corresponding character.")
3507 (code)
3508 Lisp_Object code;
3509 {
3510 unsigned char c1, c2, s1, s2;
3511 Lisp_Object val;
3512
3513 CHECK_NUMBER (code, 0);
3514 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3515 DECODE_SJIS (s1, s2, c1, c2);
3516 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3517 return val;
3518 }
3519
3520 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3521 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3522 Return the corresponding character code in SJIS.")
3523 (ch)
3524 Lisp_Object ch;
3525 {
3526 int charset, c1, c2, s1, s2;
3527 Lisp_Object val;
3528
3529 CHECK_NUMBER (ch, 0);
3530 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3531 if (charset == charset_jisx0208)
3532 {
3533 ENCODE_SJIS (c1, c2, s1, s2);
3534 XSETFASTINT (val, (s1 << 8) | s2);
3535 }
3536 else
3537 XSETFASTINT (val, 0);
3538 return val;
3539 }
3540
3541 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3542 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3543 CODE is the character code in BIG5.\n\
3544 Return the corresponding character.")
3545 (code)
3546 Lisp_Object code;
3547 {
3548 int charset;
3549 unsigned char b1, b2, c1, c2;
3550 Lisp_Object val;
3551
3552 CHECK_NUMBER (code, 0);
3553 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3554 DECODE_BIG5 (b1, b2, charset, c1, c2);
3555 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3556 return val;
3557 }
3558
3559 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3560 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3561 Return the corresponding character code in Big5.")
3562 (ch)
3563 Lisp_Object ch;
3564 {
3565 int charset, c1, c2, b1, b2;
3566 Lisp_Object val;
3567
3568 CHECK_NUMBER (ch, 0);
3569 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3570 if (charset == charset_big5_1 || charset == charset_big5_2)
3571 {
3572 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3573 XSETFASTINT (val, (b1 << 8) | b2);
3574 }
3575 else
3576 XSETFASTINT (val, 0);
3577 return val;
3578 }
3579
3580 DEFUN ("set-terminal-coding-system-internal",
3581 Fset_terminal_coding_system_internal,
3582 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3583 (coding_system)
3584 Lisp_Object coding_system;
3585 {
3586 CHECK_SYMBOL (coding_system, 0);
3587 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3588 return Qnil;
3589 }
3590
3591 DEFUN ("set-safe-terminal-coding-system-internal",
3592 Fset_safe_terminal_coding_system_internal,
3593 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3594 (coding_system)
3595 Lisp_Object coding_system;
3596 {
3597 CHECK_SYMBOL (coding_system, 0);
3598 setup_coding_system (Fcheck_coding_system (coding_system),
3599 &safe_terminal_coding);
3600 return Qnil;
3601 }
3602
3603 DEFUN ("terminal-coding-system",
3604 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3605 "Return coding-system of your terminal.")
3606 ()
3607 {
3608 return terminal_coding.symbol;
3609 }
3610
3611 DEFUN ("set-keyboard-coding-system-internal",
3612 Fset_keyboard_coding_system_internal,
3613 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3614 (coding_system)
3615 Lisp_Object coding_system;
3616 {
3617 CHECK_SYMBOL (coding_system, 0);
3618 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3619 return Qnil;
3620 }
3621
3622 DEFUN ("keyboard-coding-system",
3623 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3624 "Return coding-system of what is sent from terminal keyboard.")
3625 ()
3626 {
3627 return keyboard_coding.symbol;
3628 }
3629
3630 \f
3631 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3632 Sfind_operation_coding_system, 1, MANY, 0,
3633 "Choose a coding system for an operation based on the target name.\n\
3634 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3635 DECODING-SYSTEM is the coding system to use for decoding\n\
3636 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3637 for encoding (in case OPERATION does encoding).\n\
3638 \n\
3639 The first argument OPERATION specifies an I/O primitive:\n\
3640 For file I/O, `insert-file-contents' or `write-region'.\n\
3641 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3642 For network I/O, `open-network-stream'.\n\
3643 \n\
3644 The remaining arguments should be the same arguments that were passed\n\
3645 to the primitive. Depending on which primitive, one of those arguments\n\
3646 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3647 whichever argument specifies the file name is TARGET.\n\
3648 \n\
3649 TARGET has a meaning which depends on OPERATION:\n\
3650 For file I/O, TARGET is a file name.\n\
3651 For process I/O, TARGET is a process name.\n\
3652 For network I/O, TARGET is a service name or a port number\n\
3653 \n\
3654 This function looks up what specified for TARGET in,\n\
3655 `file-coding-system-alist', `process-coding-system-alist',\n\
3656 or `network-coding-system-alist' depending on OPERATION.\n\
3657 They may specify a coding system, a cons of coding systems,\n\
3658 or a function symbol to call.\n\
3659 In the last case, we call the function with one argument,\n\
3660 which is a list of all the arguments given to this function.")
3661 (nargs, args)
3662 int nargs;
3663 Lisp_Object *args;
3664 {
3665 Lisp_Object operation, target_idx, target, val;
3666 register Lisp_Object chain;
3667
3668 if (nargs < 2)
3669 error ("Too few arguments");
3670 operation = args[0];
3671 if (!SYMBOLP (operation)
3672 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3673 error ("Invalid first arguement");
3674 if (nargs < 1 + XINT (target_idx))
3675 error ("Too few arguments for operation: %s",
3676 XSYMBOL (operation)->name->data);
3677 target = args[XINT (target_idx) + 1];
3678 if (!(STRINGP (target)
3679 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3680 error ("Invalid %dth argument", XINT (target_idx) + 1);
3681
3682 chain = ((EQ (operation, Qinsert_file_contents)
3683 || EQ (operation, Qwrite_region))
3684 ? Vfile_coding_system_alist
3685 : (EQ (operation, Qopen_network_stream)
3686 ? Vnetwork_coding_system_alist
3687 : Vprocess_coding_system_alist));
3688 if (NILP (chain))
3689 return Qnil;
3690
3691 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3692 {
3693 Lisp_Object elt = XCONS (chain)->car;
3694
3695 if (CONSP (elt)
3696 && ((STRINGP (target)
3697 && STRINGP (XCONS (elt)->car)
3698 && fast_string_match (XCONS (elt)->car, target) >= 0)
3699 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3700 {
3701 val = XCONS (elt)->cdr;
3702 if (CONSP (val))
3703 return val;
3704 if (! SYMBOLP (val))
3705 return Qnil;
3706 if (! NILP (Fcoding_system_p (val)))
3707 return Fcons (val, val);
3708 if (!NILP (Ffboundp (val)))
3709 return call1 (val, Flist (nargs, args));
3710 return Qnil;
3711 }
3712 }
3713 return Qnil;
3714 }
3715
3716 #endif /* emacs */
3717
3718 \f
3719 /*** 8. Post-amble ***/
3720
3721 init_coding_once ()
3722 {
3723 int i;
3724
3725 /* Emacs' internal format specific initialize routine. */
3726 for (i = 0; i <= 0x20; i++)
3727 emacs_code_class[i] = EMACS_control_code;
3728 emacs_code_class[0x0A] = EMACS_linefeed_code;
3729 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3730 for (i = 0x21 ; i < 0x7F; i++)
3731 emacs_code_class[i] = EMACS_ascii_code;
3732 emacs_code_class[0x7F] = EMACS_control_code;
3733 emacs_code_class[0x80] = EMACS_leading_code_composition;
3734 for (i = 0x81; i < 0xFF; i++)
3735 emacs_code_class[i] = EMACS_invalid_code;
3736 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3737 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3738 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3739 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3740
3741 /* ISO2022 specific initialize routine. */
3742 for (i = 0; i < 0x20; i++)
3743 iso_code_class[i] = ISO_control_code;
3744 for (i = 0x21; i < 0x7F; i++)
3745 iso_code_class[i] = ISO_graphic_plane_0;
3746 for (i = 0x80; i < 0xA0; i++)
3747 iso_code_class[i] = ISO_control_code;
3748 for (i = 0xA1; i < 0xFF; i++)
3749 iso_code_class[i] = ISO_graphic_plane_1;
3750 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3751 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3752 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3753 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3754 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3755 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3756 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3757 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3758 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3759 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3760
3761 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3762 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3763
3764 setup_coding_system (Qnil, &keyboard_coding);
3765 setup_coding_system (Qnil, &terminal_coding);
3766 setup_coding_system (Qnil, &safe_terminal_coding);
3767
3768 #if defined (MSDOS) || defined (WINDOWSNT)
3769 system_eol_type = CODING_EOL_CRLF;
3770 #else
3771 system_eol_type = CODING_EOL_LF;
3772 #endif
3773 }
3774
3775 #ifdef emacs
3776
3777 syms_of_coding ()
3778 {
3779 Qtarget_idx = intern ("target-idx");
3780 staticpro (&Qtarget_idx);
3781
3782 /* Target FILENAME is the first argument. */
3783 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3784 /* Target FILENAME is the third argument. */
3785 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3786
3787 Qcall_process = intern ("call-process");
3788 staticpro (&Qcall_process);
3789 /* Target PROGRAM is the first argument. */
3790 Fput (Qcall_process, Qtarget_idx, make_number (0));
3791
3792 Qcall_process_region = intern ("call-process-region");
3793 staticpro (&Qcall_process_region);
3794 /* Target PROGRAM is the third argument. */
3795 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3796
3797 Qstart_process = intern ("start-process");
3798 staticpro (&Qstart_process);
3799 /* Target PROGRAM is the third argument. */
3800 Fput (Qstart_process, Qtarget_idx, make_number (2));
3801
3802 Qopen_network_stream = intern ("open-network-stream");
3803 staticpro (&Qopen_network_stream);
3804 /* Target SERVICE is the fourth argument. */
3805 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3806
3807 Qcoding_system = intern ("coding-system");
3808 staticpro (&Qcoding_system);
3809
3810 Qeol_type = intern ("eol-type");
3811 staticpro (&Qeol_type);
3812
3813 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3814 staticpro (&Qbuffer_file_coding_system);
3815
3816 Qpost_read_conversion = intern ("post-read-conversion");
3817 staticpro (&Qpost_read_conversion);
3818
3819 Qpre_write_conversion = intern ("pre-write-conversion");
3820 staticpro (&Qpre_write_conversion);
3821
3822 Qcoding_system_spec = intern ("coding-system-spec");
3823 staticpro (&Qcoding_system_spec);
3824
3825 Qcoding_system_p = intern ("coding-system-p");
3826 staticpro (&Qcoding_system_p);
3827
3828 Qcoding_system_error = intern ("coding-system-error");
3829 staticpro (&Qcoding_system_error);
3830
3831 Fput (Qcoding_system_error, Qerror_conditions,
3832 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3833 Fput (Qcoding_system_error, Qerror_message,
3834 build_string ("Invalid coding system"));
3835
3836 Qcoding_category_index = intern ("coding-category-index");
3837 staticpro (&Qcoding_category_index);
3838
3839 {
3840 int i;
3841 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3842 {
3843 coding_category_table[i] = intern (coding_category_name[i]);
3844 staticpro (&coding_category_table[i]);
3845 Fput (coding_category_table[i], Qcoding_category_index,
3846 make_number (i));
3847 }
3848 }
3849
3850 Qcharacter_unification_table = intern ("character-unification-table");
3851 staticpro (&Qcharacter_unification_table);
3852 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3853 make_number (0));
3854
3855 Qcharacter_unification_table_for_decode
3856 = intern ("character-unification-table-for-decode");
3857 staticpro (&Qcharacter_unification_table_for_decode);
3858
3859 Qcharacter_unification_table_for_encode
3860 = intern ("character-unification-table-for-encode");
3861 staticpro (&Qcharacter_unification_table_for_encode);
3862
3863 Qemacs_mule = intern ("emacs-mule");
3864 staticpro (&Qemacs_mule);
3865
3866 defsubr (&Scoding_system_spec);
3867 defsubr (&Scoding_system_p);
3868 defsubr (&Sread_coding_system);
3869 defsubr (&Sread_non_nil_coding_system);
3870 defsubr (&Scheck_coding_system);
3871 defsubr (&Sdetect_coding_region);
3872 defsubr (&Sdecode_coding_region);
3873 defsubr (&Sencode_coding_region);
3874 defsubr (&Sdecode_coding_string);
3875 defsubr (&Sencode_coding_string);
3876 defsubr (&Sdecode_sjis_char);
3877 defsubr (&Sencode_sjis_char);
3878 defsubr (&Sdecode_big5_char);
3879 defsubr (&Sencode_big5_char);
3880 defsubr (&Sset_terminal_coding_system_internal);
3881 defsubr (&Sset_safe_terminal_coding_system_internal);
3882 defsubr (&Sterminal_coding_system);
3883 defsubr (&Sset_keyboard_coding_system_internal);
3884 defsubr (&Skeyboard_coding_system);
3885 defsubr (&Sfind_operation_coding_system);
3886
3887 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3888 "List of coding-categories (symbols) ordered by priority.");
3889 {
3890 int i;
3891
3892 Vcoding_category_list = Qnil;
3893 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3894 Vcoding_category_list
3895 = Fcons (coding_category_table[i], Vcoding_category_list);
3896 }
3897
3898 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3899 "Specify the coding system for read operations.\n\
3900 It is useful to bind this variable with `let', but do not set it globally.\n\
3901 If the value is a coding system, it is used for decoding on read operation.\n\
3902 If not, an appropriate element is used from one of the coding system alists:\n\
3903 There are three such tables, `file-coding-system-alist',\n\
3904 `process-coding-system-alist', and `network-coding-system-alist'.");
3905 Vcoding_system_for_read = Qnil;
3906
3907 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3908 "Specify the coding system for write operations.\n\
3909 It is useful to bind this variable with `let', but do not set it globally.\n\
3910 If the value is a coding system, it is used for encoding on write operation.\n\
3911 If not, an appropriate element is used from one of the coding system alists:\n\
3912 There are three such tables, `file-coding-system-alist',\n\
3913 `process-coding-system-alist', and `network-coding-system-alist'.");
3914 Vcoding_system_for_write = Qnil;
3915
3916 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3917 "Coding system used in the latest file or process I/O.");
3918 Vlast_coding_system_used = Qnil;
3919
3920 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3921 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3922 inhibit_eol_conversion = 0;
3923
3924 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3925 "Alist to decide a coding system to use for a file I/O operation.\n\
3926 The format is ((PATTERN . VAL) ...),\n\
3927 where PATTERN is a regular expression matching a file name,\n\
3928 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3929 If VAL is a coding system, it is used for both decoding and encoding\n\
3930 the file contents.\n\
3931 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3932 and the cdr part is used for encoding.\n\
3933 If VAL is a function symbol, the function must return a coding system\n\
3934 or a cons of coding systems which are used as above.\n\
3935 \n\
3936 See also the function `find-operation-coding-system'.");
3937 Vfile_coding_system_alist = Qnil;
3938
3939 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3940 "Alist to decide a coding system to use for a process I/O operation.\n\
3941 The format is ((PATTERN . VAL) ...),\n\
3942 where PATTERN is a regular expression matching a program name,\n\
3943 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3944 If VAL is a coding system, it is used for both decoding what received\n\
3945 from the program and encoding what sent to the program.\n\
3946 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3947 and the cdr part is used for encoding.\n\
3948 If VAL is a function symbol, the function must return a coding system\n\
3949 or a cons of coding systems which are used as above.\n\
3950 \n\
3951 See also the function `find-operation-coding-system'.");
3952 Vprocess_coding_system_alist = Qnil;
3953
3954 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3955 "Alist to decide a coding system to use for a network I/O operation.\n\
3956 The format is ((PATTERN . VAL) ...),\n\
3957 where PATTERN is a regular expression matching a network service name\n\
3958 or is a port number to connect to,\n\
3959 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3960 If VAL is a coding system, it is used for both decoding what received\n\
3961 from the network stream and encoding what sent to the network stream.\n\
3962 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3963 and the cdr part is used for encoding.\n\
3964 If VAL is a function symbol, the function must return a coding system\n\
3965 or a cons of coding systems which are used as above.\n\
3966 \n\
3967 See also the function `find-operation-coding-system'.");
3968 Vnetwork_coding_system_alist = Qnil;
3969
3970 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3971 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3972 eol_mnemonic_unix = ':';
3973
3974 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3975 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3976 eol_mnemonic_dos = '\\';
3977
3978 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3979 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3980 eol_mnemonic_mac = '/';
3981
3982 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3983 "Mnemonic character indicating end-of-line format is not yet decided.");
3984 eol_mnemonic_undecided = ':';
3985
3986 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3987 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3988 Venable_character_unification = Qt;
3989
3990 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3991 &Vstandard_character_unification_table_for_decode,
3992 "Table for unifying characters when reading.");
3993 Vstandard_character_unification_table_for_decode = Qnil;
3994
3995 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3996 &Vstandard_character_unification_table_for_encode,
3997 "Table for unifying characters when writing.");
3998 Vstandard_character_unification_table_for_encode = Qnil;
3999
4000 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4001 "Alist of charsets vs revision numbers.\n\
4002 While encoding, if a charset (car part of an element) is found,\n\
4003 designate it with the escape sequence identifing revision (cdr part of the element).");
4004 Vcharset_revision_alist = Qnil;
4005
4006 DEFVAR_LISP ("default-process-coding-system",
4007 &Vdefault_process_coding_system,
4008 "Cons of coding systems used for process I/O by default.\n\
4009 The car part is used for decoding a process output,\n\
4010 the cdr part is used for encoding a text to be sent to a process.");
4011 Vdefault_process_coding_system = Qnil;
4012
4013 DEFVAR_LISP ("special-microsoft-code-table", &Vmicrosoft_code_table,
4014 "Table of special Microsoft codes in the range 128..159 (inclusive).\n\
4015 This is a vector of length 256.\n\
4016 If Nth element is non-nil, the existence of code N in a file\n\
4017 (or output of subprocess) doesn't prevent it to be detected as\n\
4018 a coding system of ISO 2022 variant (e.g. iso-latin-1) on reading a file\n\
4019 or reading output of a subprocess.\n\
4020 Only 128th through 159th elements has a meaning.");
4021 Vmicrosoft_code_table = Fmake_vector (make_number (256), Qnil);
4022 }
4023
4024 #endif /* emacs */