]> code.delx.au - gnu-emacs/blob - src/coding.c
(mail-extr-all-top-level-domains): More domains.
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
69
70 4. Raw text
71
72 A coding system for a text containing random 8-bit code. Emacs does
73 no code conversion on such a text except for end-of-line format.
74
75 5. Other
76
77 If a user wants to read/write a text encoded in a coding system not
78 listed above, he can supply a decoder and an encoder for it in CCL
79 (Code Conversion Language) programs. Emacs executes the CCL program
80 while reading/writing.
81
82 Emacs represents a coding-system by a Lisp symbol that has a property
83 `coding-system'. But, before actually using the coding-system, the
84 information about it is set in a structure of type `struct
85 coding_system' for rapid processing. See section 6 for more details.
86
87 */
88
89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
90
91 How end-of-line of a text is encoded depends on a system. For
92 instance, Unix's format is just one byte of `line-feed' code,
93 whereas DOS's format is two-byte sequence of `carriage-return' and
94 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
95
96 Since text characters encoding and end-of-line encoding are
97 independent, any coding system described above can take
98 any format of end-of-line. So, Emacs has information of format of
99 end-of-line in each coding-system. See section 6 for more details.
100
101 */
102
103 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
104
105 These functions check if a text between SRC and SRC_END is encoded
106 in the coding system category XXX. Each returns an integer value in
107 which appropriate flag bits for the category XXX is set. The flag
108 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
109 template of these functions. */
110 #if 0
111 int
112 detect_coding_emacs_mule (src, src_end)
113 unsigned char *src, *src_end;
114 {
115 ...
116 }
117 #endif
118
119 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
120
121 These functions decode SRC_BYTES length text at SOURCE encoded in
122 CODING to Emacs' internal format (emacs-mule). The resulting text
123 goes to a place pointed to by DESTINATION, the length of which should
124 not exceed DST_BYTES. The number of bytes actually processed is
125 returned as *CONSUMED. The return value is the length of the decoded
126 text. Below is a template of these functions. */
127 #if 0
128 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
129 struct coding_system *coding;
130 unsigned char *source, *destination;
131 int src_bytes, dst_bytes;
132 int *consumed;
133 {
134 ...
135 }
136 #endif
137
138 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
139
140 These functions encode SRC_BYTES length text at SOURCE of Emacs'
141 internal format (emacs-mule) to CODING. The resulting text goes to
142 a place pointed to by DESTINATION, the length of which should not
143 exceed DST_BYTES. The number of bytes actually processed is
144 returned as *CONSUMED. The return value is the length of the
145 encoded text. Below is a template of these functions. */
146 #if 0
147 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
148 struct coding_system *coding;
149 unsigned char *source, *destination;
150 int src_bytes, dst_bytes;
151 int *consumed;
152 {
153 ...
154 }
155 #endif
156
157 /*** COMMONLY USED MACROS ***/
158
159 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
160 THREE_MORE_BYTES safely get one, two, and three bytes from the
161 source text respectively. If there are not enough bytes in the
162 source, they jump to `label_end_of_loop'. The caller should set
163 variables `src' and `src_end' to appropriate areas in advance. */
164
165 #define ONE_MORE_BYTE(c1) \
166 do { \
167 if (src < src_end) \
168 c1 = *src++; \
169 else \
170 goto label_end_of_loop; \
171 } while (0)
172
173 #define TWO_MORE_BYTES(c1, c2) \
174 do { \
175 if (src + 1 < src_end) \
176 c1 = *src++, c2 = *src++; \
177 else \
178 goto label_end_of_loop; \
179 } while (0)
180
181 #define THREE_MORE_BYTES(c1, c2, c3) \
182 do { \
183 if (src + 2 < src_end) \
184 c1 = *src++, c2 = *src++, c3 = *src++; \
185 else \
186 goto label_end_of_loop; \
187 } while (0)
188
189 /* The following three macros DECODE_CHARACTER_ASCII,
190 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
191 the multi-byte form of a character of each class at the place
192 pointed by `dst'. The caller should set the variable `dst' to
193 point to an appropriate area and the variable `coding' to point to
194 the coding-system of the currently decoding text in advance. */
195
196 /* Decode one ASCII character C. */
197
198 #define DECODE_CHARACTER_ASCII(c) \
199 do { \
200 if (COMPOSING_P (coding->composing)) \
201 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
202 else \
203 *dst++ = (c); \
204 } while (0)
205
206 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
207 position-code is C. */
208
209 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
210 do { \
211 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
212 if (COMPOSING_P (coding->composing)) \
213 *dst++ = leading_code + 0x20; \
214 else \
215 *dst++ = leading_code; \
216 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
217 *dst++ = leading_code; \
218 *dst++ = (c) | 0x80; \
219 } while (0)
220
221 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
222 position-codes are C1 and C2. */
223
224 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
225 do { \
226 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
227 *dst++ = (c2) | 0x80; \
228 } while (0)
229
230 \f
231 /*** 1. Preamble ***/
232
233 #include <stdio.h>
234
235 #ifdef emacs
236
237 #include <config.h>
238 #include "lisp.h"
239 #include "buffer.h"
240 #include "charset.h"
241 #include "ccl.h"
242 #include "coding.h"
243 #include "window.h"
244
245 #else /* not emacs */
246
247 #include "mulelib.h"
248
249 #endif /* not emacs */
250
251 Lisp_Object Qcoding_system, Qeol_type;
252 Lisp_Object Qbuffer_file_coding_system;
253 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
254 Lisp_Object Qno_conversion, Qundecided;
255 Lisp_Object Qcoding_system_history;
256 Lisp_Object Qsafe_charsets;
257
258 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
259 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
260 Lisp_Object Qstart_process, Qopen_network_stream;
261 Lisp_Object Qtarget_idx;
262
263 /* Mnemonic character of each format of end-of-line. */
264 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
265 /* Mnemonic character to indicate format of end-of-line is not yet
266 decided. */
267 int eol_mnemonic_undecided;
268
269 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
270 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
271 int system_eol_type;
272
273 #ifdef emacs
274
275 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
276
277 Lisp_Object Qcoding_system_p, Qcoding_system_error;
278
279 /* Coding system emacs-mule is for converting only end-of-line format. */
280 Lisp_Object Qemacs_mule;
281
282 /* Coding-systems are handed between Emacs Lisp programs and C internal
283 routines by the following three variables. */
284 /* Coding-system for reading files and receiving data from process. */
285 Lisp_Object Vcoding_system_for_read;
286 /* Coding-system for writing files and sending data to process. */
287 Lisp_Object Vcoding_system_for_write;
288 /* Coding-system actually used in the latest I/O. */
289 Lisp_Object Vlast_coding_system_used;
290
291 /* A vector of length 256 which contains information about special
292 Latin codes (espepcially for dealing with Microsoft code). */
293 Lisp_Object Vlatin_extra_code_table;
294
295 /* Flag to inhibit code conversion of end-of-line format. */
296 int inhibit_eol_conversion;
297
298 /* Coding system to be used to encode text for terminal display. */
299 struct coding_system terminal_coding;
300
301 /* Coding system to be used to encode text for terminal display when
302 terminal coding system is nil. */
303 struct coding_system safe_terminal_coding;
304
305 /* Coding system of what is sent from terminal keyboard. */
306 struct coding_system keyboard_coding;
307
308 Lisp_Object Vfile_coding_system_alist;
309 Lisp_Object Vprocess_coding_system_alist;
310 Lisp_Object Vnetwork_coding_system_alist;
311
312 #endif /* emacs */
313
314 Lisp_Object Qcoding_category_index;
315
316 /* List of symbols `coding-category-xxx' ordered by priority. */
317 Lisp_Object Vcoding_category_list;
318
319 /* Table of coding-systems currently assigned to each coding-category. */
320 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
321
322 /* Table of names of symbol for each coding-category. */
323 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
324 "coding-category-emacs-mule",
325 "coding-category-sjis",
326 "coding-category-iso-7",
327 "coding-category-iso-8-1",
328 "coding-category-iso-8-2",
329 "coding-category-iso-7-else",
330 "coding-category-iso-8-else",
331 "coding-category-big5",
332 "coding-category-raw-text",
333 "coding-category-binary"
334 };
335
336 /* Flag to tell if we look up unification table on character code
337 conversion. */
338 Lisp_Object Venable_character_unification;
339 /* Standard unification table to look up on decoding (reading). */
340 Lisp_Object Vstandard_character_unification_table_for_decode;
341 /* Standard unification table to look up on encoding (writing). */
342 Lisp_Object Vstandard_character_unification_table_for_encode;
343
344 Lisp_Object Qcharacter_unification_table;
345 Lisp_Object Qcharacter_unification_table_for_decode;
346 Lisp_Object Qcharacter_unification_table_for_encode;
347
348 /* Alist of charsets vs revision number. */
349 Lisp_Object Vcharset_revision_alist;
350
351 /* Default coding systems used for process I/O. */
352 Lisp_Object Vdefault_process_coding_system;
353
354 \f
355 /*** 2. Emacs internal format (emacs-mule) handlers ***/
356
357 /* Emacs' internal format for encoding multiple character sets is a
358 kind of multi-byte encoding, i.e. characters are encoded by
359 variable-length sequences of one-byte codes. ASCII characters
360 and control characters (e.g. `tab', `newline') are represented by
361 one-byte sequences which are their ASCII codes, in the range 0x00
362 through 0x7F. The other characters are represented by a sequence
363 of `base leading-code', optional `extended leading-code', and one
364 or two `position-code's. The length of the sequence is determined
365 by the base leading-code. Leading-code takes the range 0x80
366 through 0x9F, whereas extended leading-code and position-code take
367 the range 0xA0 through 0xFF. See `charset.h' for more details
368 about leading-code and position-code.
369
370 There's one exception to this rule. Special leading-code
371 `leading-code-composition' denotes that the following several
372 characters should be composed into one character. Leading-codes of
373 components (except for ASCII) are added 0x20. An ASCII character
374 component is represented by a 2-byte sequence of `0xA0' and
375 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
376 details of composite character. Hence, we can summarize the code
377 range as follows:
378
379 --- CODE RANGE of Emacs' internal format ---
380 (character set) (range)
381 ASCII 0x00 .. 0x7F
382 ELSE (1st byte) 0x80 .. 0x9F
383 (rest bytes) 0xA0 .. 0xFF
384 ---------------------------------------------
385
386 */
387
388 enum emacs_code_class_type emacs_code_class[256];
389
390 /* Go to the next statement only if *SRC is accessible and the code is
391 greater than 0xA0. */
392 #define CHECK_CODE_RANGE_A0_FF \
393 do { \
394 if (src >= src_end) \
395 goto label_end_of_switch; \
396 else if (*src++ < 0xA0) \
397 return 0; \
398 } while (0)
399
400 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
401 Check if a text is encoded in Emacs' internal format. If it is,
402 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
403
404 int
405 detect_coding_emacs_mule (src, src_end)
406 unsigned char *src, *src_end;
407 {
408 unsigned char c;
409 int composing = 0;
410
411 while (src < src_end)
412 {
413 c = *src++;
414
415 if (composing)
416 {
417 if (c < 0xA0)
418 composing = 0;
419 else
420 c -= 0x20;
421 }
422
423 switch (emacs_code_class[c])
424 {
425 case EMACS_ascii_code:
426 case EMACS_linefeed_code:
427 break;
428
429 case EMACS_control_code:
430 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
431 return 0;
432 break;
433
434 case EMACS_invalid_code:
435 return 0;
436
437 case EMACS_leading_code_composition: /* c == 0x80 */
438 if (composing)
439 CHECK_CODE_RANGE_A0_FF;
440 else
441 composing = 1;
442 break;
443
444 case EMACS_leading_code_4:
445 CHECK_CODE_RANGE_A0_FF;
446 /* fall down to check it two more times ... */
447
448 case EMACS_leading_code_3:
449 CHECK_CODE_RANGE_A0_FF;
450 /* fall down to check it one more time ... */
451
452 case EMACS_leading_code_2:
453 CHECK_CODE_RANGE_A0_FF;
454 break;
455
456 default:
457 label_end_of_switch:
458 break;
459 }
460 }
461 return CODING_CATEGORY_MASK_EMACS_MULE;
462 }
463
464 \f
465 /*** 3. ISO2022 handlers ***/
466
467 /* The following note describes the coding system ISO2022 briefly.
468 Since the intention of this note is to help in understanding of
469 the programs in this file, some parts are NOT ACCURATE or OVERLY
470 SIMPLIFIED. For the thorough understanding, please refer to the
471 original document of ISO2022.
472
473 ISO2022 provides many mechanisms to encode several character sets
474 in 7-bit and 8-bit environment. If one chooses 7-bite environment,
475 all text is encoded by codes of less than 128. This may make the
476 encoded text a little bit longer, but the text gets more stability
477 to pass through several gateways (some of them strip off the MSB).
478
479 There are two kinds of character set: control character set and
480 graphic character set. The former contains control characters such
481 as `newline' and `escape' to provide control functions (control
482 functions are provided also by escape sequences). The latter
483 contains graphic characters such as ' A' and '-'. Emacs recognizes
484 two control character sets and many graphic character sets.
485
486 Graphic character sets are classified into one of the following
487 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
488 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
489 bytes (DIMENSION) and the number of characters in one dimension
490 (CHARS) of the set. In addition, each character set is assigned an
491 identification tag (called "final character" and denoted as <F>
492 here after) which is unique in each class. <F> of each character
493 set is decided by ECMA(*) when it is registered in ISO. Code range
494 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
495
496 Note (*): ECMA = European Computer Manufacturers Association
497
498 Here are examples of graphic character set [NAME(<F>)]:
499 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
500 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
501 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
502 o DIMENSION2_CHARS96 -- none for the moment
503
504 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
505 C0 [0x00..0x1F] -- control character plane 0
506 GL [0x20..0x7F] -- graphic character plane 0
507 C1 [0x80..0x9F] -- control character plane 1
508 GR [0xA0..0xFF] -- graphic character plane 1
509
510 A control character set is directly designated and invoked to C0 or
511 C1 by an escape sequence. The most common case is that ISO646's
512 control character set is designated/invoked to C0 and ISO6429's
513 control character set is designated/invoked to C1, and usually
514 these designations/invocations are omitted in a coded text. With
515 7-bit environment, only C0 can be used, and a control character for
516 C1 is encoded by an appropriate escape sequence to fit in the
517 environment. All control characters for C1 are defined the
518 corresponding escape sequences.
519
520 A graphic character set is at first designated to one of four
521 graphic registers (G0 through G3), then these graphic registers are
522 invoked to GL or GR. These designations and invocations can be
523 done independently. The most common case is that G0 is invoked to
524 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
525 these invocations and designations are omitted in a coded text.
526 With 7-bit environment, only GL can be used.
527
528 When a graphic character set of CHARS94 is invoked to GL, code 0x20
529 and 0x7F of GL area work as control characters SPACE and DEL
530 respectively, and code 0xA0 and 0xFF of GR area should not be used.
531
532 There are two ways of invocation: locking-shift and single-shift.
533 With locking-shift, the invocation lasts until the next different
534 invocation, whereas with single-shift, the invocation works only
535 for the following character and doesn't affect locking-shift.
536 Invocations are done by the following control characters or escape
537 sequences.
538
539 ----------------------------------------------------------------------
540 function control char escape sequence description
541 ----------------------------------------------------------------------
542 SI (shift-in) 0x0F none invoke G0 to GL
543 SO (shift-out) 0x0E none invoke G1 to GL
544 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
545 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
546 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
547 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
548 ----------------------------------------------------------------------
549 The first four are for locking-shift. Control characters for these
550 functions are defined by macros ISO_CODE_XXX in `coding.h'.
551
552 Designations are done by the following escape sequences.
553 ----------------------------------------------------------------------
554 escape sequence description
555 ----------------------------------------------------------------------
556 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
557 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
558 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
559 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
560 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
561 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
562 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
563 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
564 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
565 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
566 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
567 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
568 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
569 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
570 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
571 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
572 ----------------------------------------------------------------------
573
574 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
575 of dimension 1, chars 94, and final character <F>, and etc.
576
577 Note (*): Although these designations are not allowed in ISO2022,
578 Emacs accepts them on decoding, and produces them on encoding
579 CHARS96 character set in a coding system which is characterized as
580 7-bit environment, non-locking-shift, and non-single-shift.
581
582 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
583 '(' can be omitted. We call this as "short-form" here after.
584
585 Now you may notice that there are a lot of ways for encoding the
586 same multilingual text in ISO2022. Actually, there exists many
587 coding systems such as Compound Text (used in X's inter client
588 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
589 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
590 localized platforms), and all of these are variants of ISO2022.
591
592 In addition to the above, Emacs handles two more kinds of escape
593 sequences: ISO6429's direction specification and Emacs' private
594 sequence for specifying character composition.
595
596 ISO6429's direction specification takes the following format:
597 o CSI ']' -- end of the current direction
598 o CSI '0' ']' -- end of the current direction
599 o CSI '1' ']' -- start of left-to-right text
600 o CSI '2' ']' -- start of right-to-left text
601 The control character CSI (0x9B: control sequence introducer) is
602 abbreviated to the escape sequence ESC '[' in 7-bit environment.
603
604 Character composition specification takes the following format:
605 o ESC '0' -- start character composition
606 o ESC '1' -- end character composition
607 Since these are not standard escape sequences of any ISO, the use
608 of them for these meaning is restricted to Emacs only. */
609
610 enum iso_code_class_type iso_code_class[256];
611
612 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
613 Check if a text is encoded in ISO2022. If it is, returns an
614 integer in which appropriate flag bits any of:
615 CODING_CATEGORY_MASK_ISO_7
616 CODING_CATEGORY_MASK_ISO_8_1
617 CODING_CATEGORY_MASK_ISO_8_2
618 CODING_CATEGORY_MASK_ISO_7_ELSE
619 CODING_CATEGORY_MASK_ISO_8_ELSE
620 are set. If a code which should never appear in ISO2022 is found,
621 returns 0. */
622
623 int
624 detect_coding_iso2022 (src, src_end)
625 unsigned char *src, *src_end;
626 {
627 int mask = (CODING_CATEGORY_MASK_ISO_7
628 | CODING_CATEGORY_MASK_ISO_8_1
629 | CODING_CATEGORY_MASK_ISO_8_2
630 | CODING_CATEGORY_MASK_ISO_7_ELSE
631 | CODING_CATEGORY_MASK_ISO_8_ELSE
632 );
633 int g1 = 0; /* 1 iff designating to G1. */
634 int c, i;
635 struct coding_system coding_iso_8_1, coding_iso_8_2;
636
637 /* Coding systems of these categories may accept latin extra codes. */
638 setup_coding_system
639 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
640 &coding_iso_8_1);
641 setup_coding_system
642 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
643 &coding_iso_8_2);
644
645 while (mask && src < src_end)
646 {
647 c = *src++;
648 switch (c)
649 {
650 case ISO_CODE_ESC:
651 if (src >= src_end)
652 break;
653 c = *src++;
654 if ((c >= '(' && c <= '/'))
655 {
656 /* Designation sequence for a charset of dimension 1. */
657 if (src >= src_end)
658 break;
659 c = *src++;
660 if (c < ' ' || c >= 0x80)
661 /* Invalid designation sequence. */
662 return 0;
663 }
664 else if (c == '$')
665 {
666 /* Designation sequence for a charset of dimension 2. */
667 if (src >= src_end)
668 break;
669 c = *src++;
670 if (c >= '@' && c <= 'B')
671 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
672 ;
673 else if (c >= '(' && c <= '/')
674 {
675 if (src >= src_end)
676 break;
677 c = *src++;
678 if (c < ' ' || c >= 0x80)
679 /* Invalid designation sequence. */
680 return 0;
681 }
682 else
683 /* Invalid designation sequence. */
684 return 0;
685 }
686 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
687 /* Locking shift. */
688 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
689 | CODING_CATEGORY_MASK_ISO_8_ELSE);
690 else if (c == '0' || c == '1' || c == '2')
691 /* Start/end composition. */
692 ;
693 else
694 /* Invalid escape sequence. */
695 return 0;
696 break;
697
698 case ISO_CODE_SO:
699 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
700 | CODING_CATEGORY_MASK_ISO_8_ELSE);
701 break;
702
703 case ISO_CODE_CSI:
704 case ISO_CODE_SS2:
705 case ISO_CODE_SS3:
706 {
707 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
708
709 if (c != ISO_CODE_CSI)
710 {
711 if (coding_iso_8_1.flags & CODING_FLAG_ISO_SINGLE_SHIFT)
712 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
713 if (coding_iso_8_2.flags & CODING_FLAG_ISO_SINGLE_SHIFT)
714 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
715 }
716 if (VECTORP (Vlatin_extra_code_table)
717 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
718 {
719 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
720 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
721 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
722 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
723 }
724 mask &= newmask;
725 }
726 break;
727
728 default:
729 if (c < 0x80)
730 break;
731 else if (c < 0xA0)
732 {
733 if (VECTORP (Vlatin_extra_code_table)
734 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
735 {
736 int newmask = 0;
737
738 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
739 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
740 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
741 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
742 mask &= newmask;
743 }
744 else
745 return 0;
746 }
747 else
748 {
749 unsigned char *src_begin = src;
750
751 mask &= ~(CODING_CATEGORY_MASK_ISO_7
752 | CODING_CATEGORY_MASK_ISO_7_ELSE);
753 while (src < src_end && *src >= 0xA0)
754 src++;
755 if ((src - src_begin - 1) & 1 && src < src_end)
756 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
757 }
758 break;
759 }
760 }
761
762 return mask;
763 }
764
765 /* Decode a character of which charset is CHARSET and the 1st position
766 code is C1. If dimension of CHARSET is 2, the 2nd position code is
767 fetched from SRC and set to C2. If CHARSET is negative, it means
768 that we are decoding ill formed text, and what we can do is just to
769 read C1 as is. */
770
771 #define DECODE_ISO_CHARACTER(charset, c1) \
772 do { \
773 int c_alt, charset_alt = (charset); \
774 if (COMPOSING_HEAD_P (coding->composing)) \
775 { \
776 *dst++ = LEADING_CODE_COMPOSITION; \
777 if (COMPOSING_WITH_RULE_P (coding->composing)) \
778 /* To tell composition rules are embeded. */ \
779 *dst++ = 0xFF; \
780 coding->composing += 2; \
781 } \
782 if ((charset) >= 0) \
783 { \
784 if (CHARSET_DIMENSION (charset) == 2) \
785 { \
786 ONE_MORE_BYTE (c2); \
787 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
788 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
789 { \
790 src--; \
791 c2 = ' '; \
792 } \
793 } \
794 if (!NILP (unification_table) \
795 && ((c_alt = unify_char (unification_table, \
796 -1, (charset), c1, c2)) >= 0)) \
797 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
798 } \
799 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
800 DECODE_CHARACTER_ASCII (c1); \
801 else if (CHARSET_DIMENSION (charset_alt) == 1) \
802 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
803 else \
804 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
805 if (COMPOSING_WITH_RULE_P (coding->composing)) \
806 /* To tell a composition rule follows. */ \
807 coding->composing = COMPOSING_WITH_RULE_RULE; \
808 } while (0)
809
810 /* Set designation state into CODING. */
811 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
812 do { \
813 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
814 make_number (chars), \
815 make_number (final_char)); \
816 if (charset >= 0) \
817 { \
818 if (coding->direction == 1 \
819 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
820 charset = CHARSET_REVERSE_CHARSET (charset); \
821 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
822 } \
823 } while (0)
824
825 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
826
827 int
828 decode_coding_iso2022 (coding, source, destination,
829 src_bytes, dst_bytes, consumed)
830 struct coding_system *coding;
831 unsigned char *source, *destination;
832 int src_bytes, dst_bytes;
833 int *consumed;
834 {
835 unsigned char *src = source;
836 unsigned char *src_end = source + src_bytes;
837 unsigned char *dst = destination;
838 unsigned char *dst_end = destination + dst_bytes;
839 /* Since the maximum bytes produced by each loop is 7, we subtract 6
840 from DST_END to assure that overflow checking is necessary only
841 at the head of loop. */
842 unsigned char *adjusted_dst_end = dst_end - 6;
843 int charset;
844 /* Charsets invoked to graphic plane 0 and 1 respectively. */
845 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
846 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
847 Lisp_Object unification_table
848 = coding->character_unification_table_for_decode;
849
850 if (!NILP (Venable_character_unification) && NILP (unification_table))
851 unification_table = Vstandard_character_unification_table_for_decode;
852
853 while (src < src_end && dst < adjusted_dst_end)
854 {
855 /* SRC_BASE remembers the start position in source in each loop.
856 The loop will be exited when there's not enough source text
857 to analyze long escape sequence or 2-byte code (within macros
858 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
859 to SRC_BASE before exiting. */
860 unsigned char *src_base = src;
861 int c1 = *src++, c2;
862
863 switch (iso_code_class [c1])
864 {
865 case ISO_0x20_or_0x7F:
866 if (!coding->composing
867 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
868 {
869 /* This is SPACE or DEL. */
870 *dst++ = c1;
871 break;
872 }
873 /* This is a graphic character, we fall down ... */
874
875 case ISO_graphic_plane_0:
876 if (coding->composing == COMPOSING_WITH_RULE_RULE)
877 {
878 /* This is a composition rule. */
879 *dst++ = c1 | 0x80;
880 coding->composing = COMPOSING_WITH_RULE_TAIL;
881 }
882 else
883 DECODE_ISO_CHARACTER (charset0, c1);
884 break;
885
886 case ISO_0xA0_or_0xFF:
887 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
888 {
889 /* Invalid code. */
890 *dst++ = c1;
891 break;
892 }
893 /* This is a graphic character, we fall down ... */
894
895 case ISO_graphic_plane_1:
896 DECODE_ISO_CHARACTER (charset1, c1);
897 break;
898
899 case ISO_control_code:
900 /* All ISO2022 control characters in this class have the
901 same representation in Emacs internal format. */
902 *dst++ = c1;
903 break;
904
905 case ISO_carriage_return:
906 if (coding->eol_type == CODING_EOL_CR)
907 {
908 *dst++ = '\n';
909 }
910 else if (coding->eol_type == CODING_EOL_CRLF)
911 {
912 ONE_MORE_BYTE (c1);
913 if (c1 == ISO_CODE_LF)
914 *dst++ = '\n';
915 else
916 {
917 src--;
918 *dst++ = c1;
919 }
920 }
921 else
922 {
923 *dst++ = c1;
924 }
925 break;
926
927 case ISO_shift_out:
928 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
929 goto label_invalid_escape_sequence;
930 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
931 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
932 break;
933
934 case ISO_shift_in:
935 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
936 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
937 break;
938
939 case ISO_single_shift_2_7:
940 case ISO_single_shift_2:
941 /* SS2 is handled as an escape sequence of ESC 'N' */
942 c1 = 'N';
943 goto label_escape_sequence;
944
945 case ISO_single_shift_3:
946 /* SS2 is handled as an escape sequence of ESC 'O' */
947 c1 = 'O';
948 goto label_escape_sequence;
949
950 case ISO_control_sequence_introducer:
951 /* CSI is handled as an escape sequence of ESC '[' ... */
952 c1 = '[';
953 goto label_escape_sequence;
954
955 case ISO_escape:
956 ONE_MORE_BYTE (c1);
957 label_escape_sequence:
958 /* Escape sequences handled by Emacs are invocation,
959 designation, direction specification, and character
960 composition specification. */
961 switch (c1)
962 {
963 case '&': /* revision of following character set */
964 ONE_MORE_BYTE (c1);
965 if (!(c1 >= '@' && c1 <= '~'))
966 goto label_invalid_escape_sequence;
967 ONE_MORE_BYTE (c1);
968 if (c1 != ISO_CODE_ESC)
969 goto label_invalid_escape_sequence;
970 ONE_MORE_BYTE (c1);
971 goto label_escape_sequence;
972
973 case '$': /* designation of 2-byte character set */
974 ONE_MORE_BYTE (c1);
975 if (c1 >= '@' && c1 <= 'B')
976 { /* designation of JISX0208.1978, GB2312.1980,
977 or JISX0208.1980 */
978 DECODE_DESIGNATION (0, 2, 94, c1);
979 }
980 else if (c1 >= 0x28 && c1 <= 0x2B)
981 { /* designation of DIMENSION2_CHARS94 character set */
982 ONE_MORE_BYTE (c2);
983 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
984 }
985 else if (c1 >= 0x2C && c1 <= 0x2F)
986 { /* designation of DIMENSION2_CHARS96 character set */
987 ONE_MORE_BYTE (c2);
988 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
989 }
990 else
991 goto label_invalid_escape_sequence;
992 break;
993
994 case 'n': /* invocation of locking-shift-2 */
995 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
996 goto label_invalid_escape_sequence;
997 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
998 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
999 break;
1000
1001 case 'o': /* invocation of locking-shift-3 */
1002 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1003 goto label_invalid_escape_sequence;
1004 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1005 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1006 break;
1007
1008 case 'N': /* invocation of single-shift-2 */
1009 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1010 goto label_invalid_escape_sequence;
1011 ONE_MORE_BYTE (c1);
1012 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1013 DECODE_ISO_CHARACTER (charset, c1);
1014 break;
1015
1016 case 'O': /* invocation of single-shift-3 */
1017 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1018 goto label_invalid_escape_sequence;
1019 ONE_MORE_BYTE (c1);
1020 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1021 DECODE_ISO_CHARACTER (charset, c1);
1022 break;
1023
1024 case '0': /* start composing without embeded rules */
1025 coding->composing = COMPOSING_NO_RULE_HEAD;
1026 break;
1027
1028 case '1': /* end composing */
1029 coding->composing = COMPOSING_NO;
1030 break;
1031
1032 case '2': /* start composing with embeded rules */
1033 coding->composing = COMPOSING_WITH_RULE_HEAD;
1034 break;
1035
1036 case '[': /* specification of direction */
1037 /* For the moment, nested direction is not supported.
1038 So, the value of `coding->direction' is 0 or 1: 0
1039 means left-to-right, 1 means right-to-left. */
1040 ONE_MORE_BYTE (c1);
1041 switch (c1)
1042 {
1043 case ']': /* end of the current direction */
1044 coding->direction = 0;
1045
1046 case '0': /* end of the current direction */
1047 case '1': /* start of left-to-right direction */
1048 ONE_MORE_BYTE (c1);
1049 if (c1 == ']')
1050 coding->direction = 0;
1051 else
1052 goto label_invalid_escape_sequence;
1053 break;
1054
1055 case '2': /* start of right-to-left direction */
1056 ONE_MORE_BYTE (c1);
1057 if (c1 == ']')
1058 coding->direction= 1;
1059 else
1060 goto label_invalid_escape_sequence;
1061 break;
1062
1063 default:
1064 goto label_invalid_escape_sequence;
1065 }
1066 break;
1067
1068 default:
1069 if (c1 >= 0x28 && c1 <= 0x2B)
1070 { /* designation of DIMENSION1_CHARS94 character set */
1071 ONE_MORE_BYTE (c2);
1072 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1073 }
1074 else if (c1 >= 0x2C && c1 <= 0x2F)
1075 { /* designation of DIMENSION1_CHARS96 character set */
1076 ONE_MORE_BYTE (c2);
1077 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1078 }
1079 else
1080 {
1081 goto label_invalid_escape_sequence;
1082 }
1083 }
1084 /* We must update these variables now. */
1085 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1086 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1087 break;
1088
1089 label_invalid_escape_sequence:
1090 {
1091 int length = src - src_base;
1092
1093 bcopy (src_base, dst, length);
1094 dst += length;
1095 }
1096 }
1097 continue;
1098
1099 label_end_of_loop:
1100 coding->carryover_size = src - src_base;
1101 bcopy (src_base, coding->carryover, coding->carryover_size);
1102 src = src_base;
1103 break;
1104 }
1105
1106 /* If this is the last block of the text to be decoded, we had
1107 better just flush out all remaining codes in the text although
1108 they are not valid characters. */
1109 if (coding->last_block)
1110 {
1111 bcopy (src, dst, src_end - src);
1112 dst += (src_end - src);
1113 src = src_end;
1114 }
1115 *consumed = src - source;
1116 return dst - destination;
1117 }
1118
1119 /* ISO2022 encoding stuff. */
1120
1121 /*
1122 It is not enough to say just "ISO2022" on encoding, we have to
1123 specify more details. In Emacs, each coding-system of ISO2022
1124 variant has the following specifications:
1125 1. Initial designation to G0 thru G3.
1126 2. Allows short-form designation?
1127 3. ASCII should be designated to G0 before control characters?
1128 4. ASCII should be designated to G0 at end of line?
1129 5. 7-bit environment or 8-bit environment?
1130 6. Use locking-shift?
1131 7. Use Single-shift?
1132 And the following two are only for Japanese:
1133 8. Use ASCII in place of JIS0201-1976-Roman?
1134 9. Use JISX0208-1983 in place of JISX0208-1978?
1135 These specifications are encoded in `coding->flags' as flag bits
1136 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1137 details.
1138 */
1139
1140 /* Produce codes (escape sequence) for designating CHARSET to graphic
1141 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1142 the coding system CODING allows, produce designation sequence of
1143 short-form. */
1144
1145 #define ENCODE_DESIGNATION(charset, reg, coding) \
1146 do { \
1147 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1148 char *intermediate_char_94 = "()*+"; \
1149 char *intermediate_char_96 = ",-./"; \
1150 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
1151 if (revision < 255) \
1152 { \
1153 *dst++ = ISO_CODE_ESC; \
1154 *dst++ = '&'; \
1155 *dst++ = '@' + revision; \
1156 } \
1157 *dst++ = ISO_CODE_ESC; \
1158 if (CHARSET_DIMENSION (charset) == 1) \
1159 { \
1160 if (CHARSET_CHARS (charset) == 94) \
1161 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1162 else \
1163 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1164 } \
1165 else \
1166 { \
1167 *dst++ = '$'; \
1168 if (CHARSET_CHARS (charset) == 94) \
1169 { \
1170 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1171 || reg != 0 \
1172 || final_char < '@' || final_char > 'B') \
1173 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1174 } \
1175 else \
1176 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1177 } \
1178 *dst++ = final_char; \
1179 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1180 } while (0)
1181
1182 /* The following two macros produce codes (control character or escape
1183 sequence) for ISO2022 single-shift functions (single-shift-2 and
1184 single-shift-3). */
1185
1186 #define ENCODE_SINGLE_SHIFT_2 \
1187 do { \
1188 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1189 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1190 else \
1191 *dst++ = ISO_CODE_SS2; \
1192 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1193 } while (0)
1194
1195 #define ENCODE_SINGLE_SHIFT_3 \
1196 do { \
1197 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1198 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1199 else \
1200 *dst++ = ISO_CODE_SS3; \
1201 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1202 } while (0)
1203
1204 /* The following four macros produce codes (control character or
1205 escape sequence) for ISO2022 locking-shift functions (shift-in,
1206 shift-out, locking-shift-2, and locking-shift-3). */
1207
1208 #define ENCODE_SHIFT_IN \
1209 do { \
1210 *dst++ = ISO_CODE_SI; \
1211 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1212 } while (0)
1213
1214 #define ENCODE_SHIFT_OUT \
1215 do { \
1216 *dst++ = ISO_CODE_SO; \
1217 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1218 } while (0)
1219
1220 #define ENCODE_LOCKING_SHIFT_2 \
1221 do { \
1222 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1223 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1224 } while (0)
1225
1226 #define ENCODE_LOCKING_SHIFT_3 \
1227 do { \
1228 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1229 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1230 } while (0)
1231
1232 /* Produce codes for a DIMENSION1 character whose character set is
1233 CHARSET and whose position-code is C1. Designation and invocation
1234 sequences are also produced in advance if necessary. */
1235
1236
1237 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1238 do { \
1239 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1240 { \
1241 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1242 *dst++ = c1 & 0x7F; \
1243 else \
1244 *dst++ = c1 | 0x80; \
1245 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1246 break; \
1247 } \
1248 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1249 { \
1250 *dst++ = c1 & 0x7F; \
1251 break; \
1252 } \
1253 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1254 { \
1255 *dst++ = c1 | 0x80; \
1256 break; \
1257 } \
1258 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1259 && !coding->safe_charsets[charset]) \
1260 { \
1261 /* We should not encode this character, instead produce one or \
1262 two `?'s. */ \
1263 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1264 if (CHARSET_WIDTH (charset) == 2) \
1265 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1266 break; \
1267 } \
1268 else \
1269 /* Since CHARSET is not yet invoked to any graphic planes, we \
1270 must invoke it, or, at first, designate it to some graphic \
1271 register. Then repeat the loop to actually produce the \
1272 character. */ \
1273 dst = encode_invocation_designation (charset, coding, dst); \
1274 } while (1)
1275
1276 /* Produce codes for a DIMENSION2 character whose character set is
1277 CHARSET and whose position-codes are C1 and C2. Designation and
1278 invocation codes are also produced in advance if necessary. */
1279
1280 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1281 do { \
1282 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1283 { \
1284 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1285 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1286 else \
1287 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1288 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1289 break; \
1290 } \
1291 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1292 { \
1293 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1294 break; \
1295 } \
1296 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1297 { \
1298 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1299 break; \
1300 } \
1301 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1302 && !coding->safe_charsets[charset]) \
1303 { \
1304 /* We should not encode this character, instead produce one or \
1305 two `?'s. */ \
1306 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1307 if (CHARSET_WIDTH (charset) == 2) \
1308 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1309 break; \
1310 } \
1311 else \
1312 /* Since CHARSET is not yet invoked to any graphic planes, we \
1313 must invoke it, or, at first, designate it to some graphic \
1314 register. Then repeat the loop to actually produce the \
1315 character. */ \
1316 dst = encode_invocation_designation (charset, coding, dst); \
1317 } while (1)
1318
1319 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1320 do { \
1321 int c_alt, charset_alt; \
1322 if (!NILP (unification_table) \
1323 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1324 >= 0)) \
1325 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1326 else \
1327 charset_alt = charset; \
1328 if (CHARSET_DIMENSION (charset_alt) == 1) \
1329 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1330 else \
1331 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1332 } while (0)
1333
1334 /* Produce designation and invocation codes at a place pointed by DST
1335 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1336 Return new DST. */
1337
1338 unsigned char *
1339 encode_invocation_designation (charset, coding, dst)
1340 int charset;
1341 struct coding_system *coding;
1342 unsigned char *dst;
1343 {
1344 int reg; /* graphic register number */
1345
1346 /* At first, check designations. */
1347 for (reg = 0; reg < 4; reg++)
1348 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1349 break;
1350
1351 if (reg >= 4)
1352 {
1353 /* CHARSET is not yet designated to any graphic registers. */
1354 /* At first check the requested designation. */
1355 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1356 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1357 /* Since CHARSET requests no special designation, designate it
1358 to graphic register 0. */
1359 reg = 0;
1360
1361 ENCODE_DESIGNATION (charset, reg, coding);
1362 }
1363
1364 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1365 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1366 {
1367 /* Since the graphic register REG is not invoked to any graphic
1368 planes, invoke it to graphic plane 0. */
1369 switch (reg)
1370 {
1371 case 0: /* graphic register 0 */
1372 ENCODE_SHIFT_IN;
1373 break;
1374
1375 case 1: /* graphic register 1 */
1376 ENCODE_SHIFT_OUT;
1377 break;
1378
1379 case 2: /* graphic register 2 */
1380 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1381 ENCODE_SINGLE_SHIFT_2;
1382 else
1383 ENCODE_LOCKING_SHIFT_2;
1384 break;
1385
1386 case 3: /* graphic register 3 */
1387 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1388 ENCODE_SINGLE_SHIFT_3;
1389 else
1390 ENCODE_LOCKING_SHIFT_3;
1391 break;
1392 }
1393 }
1394 return dst;
1395 }
1396
1397 /* The following two macros produce codes for indicating composition. */
1398 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1399 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1400 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1401
1402 /* The following three macros produce codes for indicating direction
1403 of text. */
1404 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1405 do { \
1406 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1407 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1408 else \
1409 *dst++ = ISO_CODE_CSI; \
1410 } while (0)
1411
1412 #define ENCODE_DIRECTION_R2L \
1413 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1414
1415 #define ENCODE_DIRECTION_L2R \
1416 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1417
1418 /* Produce codes for designation and invocation to reset the graphic
1419 planes and registers to initial state. */
1420 #define ENCODE_RESET_PLANE_AND_REGISTER \
1421 do { \
1422 int reg; \
1423 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1424 ENCODE_SHIFT_IN; \
1425 for (reg = 0; reg < 4; reg++) \
1426 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1427 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1428 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1429 ENCODE_DESIGNATION \
1430 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1431 } while (0)
1432
1433 /* Produce designation sequences of charsets in the line started from
1434 *SRC to a place pointed by DSTP.
1435
1436 If the current block ends before any end-of-line, we may fail to
1437 find all the necessary *designations. */
1438 encode_designation_at_bol (coding, table, src, src_end, dstp)
1439 struct coding_system *coding;
1440 Lisp_Object table;
1441 unsigned char *src, *src_end, **dstp;
1442 {
1443 int charset, c, found = 0, reg;
1444 /* Table of charsets to be designated to each graphic register. */
1445 int r[4];
1446 unsigned char *dst = *dstp;
1447
1448 for (reg = 0; reg < 4; reg++)
1449 r[reg] = -1;
1450
1451 while (src < src_end && *src != '\n' && found < 4)
1452 {
1453 int bytes = BYTES_BY_CHAR_HEAD (*src);
1454
1455 if (NILP (table))
1456 charset = CHARSET_AT (src);
1457 else
1458 {
1459 int c_alt;
1460 unsigned char c1, c2;
1461
1462 SPLIT_STRING(src, bytes, charset, c1, c2);
1463 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1464 charset = CHAR_CHARSET (c_alt);
1465 }
1466
1467 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1468 if (r[reg] < 0)
1469 {
1470 found++;
1471 r[reg] = charset;
1472 }
1473
1474 src += bytes;
1475 }
1476
1477 if (found)
1478 {
1479 for (reg = 0; reg < 4; reg++)
1480 if (r[reg] >= 0
1481 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1482 ENCODE_DESIGNATION (r[reg], reg, coding);
1483 *dstp = dst;
1484 }
1485 }
1486
1487 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1488
1489 int
1490 encode_coding_iso2022 (coding, source, destination,
1491 src_bytes, dst_bytes, consumed)
1492 struct coding_system *coding;
1493 unsigned char *source, *destination;
1494 int src_bytes, dst_bytes;
1495 int *consumed;
1496 {
1497 unsigned char *src = source;
1498 unsigned char *src_end = source + src_bytes;
1499 unsigned char *dst = destination;
1500 unsigned char *dst_end = destination + dst_bytes;
1501 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1502 from DST_END to assure overflow checking is necessary only at the
1503 head of loop. */
1504 unsigned char *adjusted_dst_end = dst_end - 19;
1505 Lisp_Object unification_table
1506 = coding->character_unification_table_for_encode;
1507
1508 if (!NILP (Venable_character_unification) && NILP (unification_table))
1509 unification_table = Vstandard_character_unification_table_for_encode;
1510
1511 while (src < src_end && dst < adjusted_dst_end)
1512 {
1513 /* SRC_BASE remembers the start position in source in each loop.
1514 The loop will be exited when there's not enough source text
1515 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1516 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1517 reset to SRC_BASE before exiting. */
1518 unsigned char *src_base = src;
1519 int charset, c1, c2, c3, c4;
1520
1521 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1522 && CODING_SPEC_ISO_BOL (coding))
1523 {
1524 /* We have to produce designation sequences if any now. */
1525 encode_designation_at_bol (coding, unification_table,
1526 src, src_end, &dst);
1527 CODING_SPEC_ISO_BOL (coding) = 0;
1528 }
1529
1530 c1 = *src++;
1531 /* If we are seeing a component of a composite character, we are
1532 seeing a leading-code specially encoded for composition, or a
1533 composition rule if composing with rule. We must set C1
1534 to a normal leading-code or an ASCII code. If we are not at
1535 a composed character, we must reset the composition state. */
1536 if (COMPOSING_P (coding->composing))
1537 {
1538 if (c1 < 0xA0)
1539 {
1540 /* We are not in a composite character any longer. */
1541 coding->composing = COMPOSING_NO;
1542 ENCODE_COMPOSITION_END;
1543 }
1544 else
1545 {
1546 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1547 {
1548 *dst++ = c1 & 0x7F;
1549 coding->composing = COMPOSING_WITH_RULE_HEAD;
1550 continue;
1551 }
1552 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1553 coding->composing = COMPOSING_WITH_RULE_RULE;
1554 if (c1 == 0xA0)
1555 {
1556 /* This is an ASCII component. */
1557 ONE_MORE_BYTE (c1);
1558 c1 &= 0x7F;
1559 }
1560 else
1561 /* This is a leading-code of non ASCII component. */
1562 c1 -= 0x20;
1563 }
1564 }
1565
1566 /* Now encode one character. C1 is a control character, an
1567 ASCII character, or a leading-code of multi-byte character. */
1568 switch (emacs_code_class[c1])
1569 {
1570 case EMACS_ascii_code:
1571 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1572 break;
1573
1574 case EMACS_control_code:
1575 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1576 ENCODE_RESET_PLANE_AND_REGISTER;
1577 *dst++ = c1;
1578 break;
1579
1580 case EMACS_carriage_return_code:
1581 if (!coding->selective)
1582 {
1583 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1584 ENCODE_RESET_PLANE_AND_REGISTER;
1585 *dst++ = c1;
1586 break;
1587 }
1588 /* fall down to treat '\r' as '\n' ... */
1589
1590 case EMACS_linefeed_code:
1591 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1592 ENCODE_RESET_PLANE_AND_REGISTER;
1593 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1594 bcopy (coding->spec.iso2022.initial_designation,
1595 coding->spec.iso2022.current_designation,
1596 sizeof coding->spec.iso2022.initial_designation);
1597 if (coding->eol_type == CODING_EOL_LF
1598 || coding->eol_type == CODING_EOL_UNDECIDED)
1599 *dst++ = ISO_CODE_LF;
1600 else if (coding->eol_type == CODING_EOL_CRLF)
1601 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1602 else
1603 *dst++ = ISO_CODE_CR;
1604 CODING_SPEC_ISO_BOL (coding) = 1;
1605 break;
1606
1607 case EMACS_leading_code_2:
1608 ONE_MORE_BYTE (c2);
1609 if (c2 < 0xA0)
1610 {
1611 /* invalid sequence */
1612 *dst++ = c1;
1613 *dst++ = c2;
1614 }
1615 else
1616 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1617 break;
1618
1619 case EMACS_leading_code_3:
1620 TWO_MORE_BYTES (c2, c3);
1621 if (c2 < 0xA0 || c3 < 0xA0)
1622 {
1623 /* invalid sequence */
1624 *dst++ = c1;
1625 *dst++ = c2;
1626 *dst++ = c3;
1627 }
1628 else if (c1 < LEADING_CODE_PRIVATE_11)
1629 ENCODE_ISO_CHARACTER (c1, c2, c3);
1630 else
1631 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1632 break;
1633
1634 case EMACS_leading_code_4:
1635 THREE_MORE_BYTES (c2, c3, c4);
1636 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1637 {
1638 /* invalid sequence */
1639 *dst++ = c1;
1640 *dst++ = c2;
1641 *dst++ = c3;
1642 *dst++ = c4;
1643 }
1644 else
1645 ENCODE_ISO_CHARACTER (c2, c3, c4);
1646 break;
1647
1648 case EMACS_leading_code_composition:
1649 ONE_MORE_BYTE (c2);
1650 if (c2 < 0xA0)
1651 {
1652 /* invalid sequence */
1653 *dst++ = c1;
1654 *dst++ = c2;
1655 }
1656 else if (c2 == 0xFF)
1657 {
1658 coding->composing = COMPOSING_WITH_RULE_HEAD;
1659 ENCODE_COMPOSITION_WITH_RULE_START;
1660 }
1661 else
1662 {
1663 /* Rewind one byte because it is a character code of
1664 composition elements. */
1665 src--;
1666 coding->composing = COMPOSING_NO_RULE_HEAD;
1667 ENCODE_COMPOSITION_NO_RULE_START;
1668 }
1669 break;
1670
1671 case EMACS_invalid_code:
1672 *dst++ = c1;
1673 break;
1674 }
1675 continue;
1676 label_end_of_loop:
1677 /* We reach here because the source date ends not at character
1678 boundary. */
1679 coding->carryover_size = src_end - src_base;
1680 bcopy (src_base, coding->carryover, coding->carryover_size);
1681 src = src_end;
1682 break;
1683 }
1684
1685 /* If this is the last block of the text to be encoded, we must
1686 reset graphic planes and registers to the initial state. */
1687 if (src >= src_end && coding->last_block)
1688 {
1689 ENCODE_RESET_PLANE_AND_REGISTER;
1690 if (coding->carryover_size > 0
1691 && coding->carryover_size < (dst_end - dst))
1692 {
1693 bcopy (coding->carryover, dst, coding->carryover_size);
1694 dst += coding->carryover_size;
1695 coding->carryover_size = 0;
1696 }
1697 }
1698 *consumed = src - source;
1699 return dst - destination;
1700 }
1701
1702 \f
1703 /*** 4. SJIS and BIG5 handlers ***/
1704
1705 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1706 quite widely. So, for the moment, Emacs supports them in the bare
1707 C code. But, in the future, they may be supported only by CCL. */
1708
1709 /* SJIS is a coding system encoding three character sets: ASCII, right
1710 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1711 as is. A character of charset katakana-jisx0201 is encoded by
1712 "position-code + 0x80". A character of charset japanese-jisx0208
1713 is encoded in 2-byte but two position-codes are divided and shifted
1714 so that it fit in the range below.
1715
1716 --- CODE RANGE of SJIS ---
1717 (character set) (range)
1718 ASCII 0x00 .. 0x7F
1719 KATAKANA-JISX0201 0xA0 .. 0xDF
1720 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1721 (2nd byte) 0x40 .. 0xFF
1722 -------------------------------
1723
1724 */
1725
1726 /* BIG5 is a coding system encoding two character sets: ASCII and
1727 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1728 character set and is encoded in two-byte.
1729
1730 --- CODE RANGE of BIG5 ---
1731 (character set) (range)
1732 ASCII 0x00 .. 0x7F
1733 Big5 (1st byte) 0xA1 .. 0xFE
1734 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1735 --------------------------
1736
1737 Since the number of characters in Big5 is larger than maximum
1738 characters in Emacs' charset (96x96), it can't be handled as one
1739 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1740 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1741 contains frequently used characters and the latter contains less
1742 frequently used characters. */
1743
1744 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1745 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1746 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1747 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1748
1749 /* Number of Big5 characters which have the same code in 1st byte. */
1750 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1751
1752 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1753 do { \
1754 unsigned int temp \
1755 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1756 if (b1 < 0xC9) \
1757 charset = charset_big5_1; \
1758 else \
1759 { \
1760 charset = charset_big5_2; \
1761 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1762 } \
1763 c1 = temp / (0xFF - 0xA1) + 0x21; \
1764 c2 = temp % (0xFF - 0xA1) + 0x21; \
1765 } while (0)
1766
1767 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1768 do { \
1769 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1770 if (charset == charset_big5_2) \
1771 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1772 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1773 b2 = temp % BIG5_SAME_ROW; \
1774 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1775 } while (0)
1776
1777 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1778 do { \
1779 int c_alt, charset_alt = (charset); \
1780 if (!NILP (unification_table) \
1781 && ((c_alt = unify_char (unification_table, \
1782 -1, (charset), c1, c2)) >= 0)) \
1783 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1784 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1785 DECODE_CHARACTER_ASCII (c1); \
1786 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1787 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1788 else \
1789 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1790 } while (0)
1791
1792 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1793 do { \
1794 int c_alt, charset_alt; \
1795 if (!NILP (unification_table) \
1796 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1797 >= 0)) \
1798 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1799 else \
1800 charset_alt = charset; \
1801 if (charset_alt == charset_ascii) \
1802 *dst++ = c1; \
1803 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1804 { \
1805 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1806 *dst++ = c1; \
1807 else \
1808 *dst++ = charset_alt, *dst++ = c1; \
1809 } \
1810 else \
1811 { \
1812 c1 &= 0x7F, c2 &= 0x7F; \
1813 if (sjis_p && charset_alt == charset_jisx0208) \
1814 { \
1815 unsigned char s1, s2; \
1816 \
1817 ENCODE_SJIS (c1, c2, s1, s2); \
1818 *dst++ = s1, *dst++ = s2; \
1819 } \
1820 else if (!sjis_p \
1821 && (charset_alt == charset_big5_1 \
1822 || charset_alt == charset_big5_2)) \
1823 { \
1824 unsigned char b1, b2; \
1825 \
1826 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
1827 *dst++ = b1, *dst++ = b2; \
1828 } \
1829 else \
1830 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1831 } \
1832 } while (0);
1833
1834 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1835 Check if a text is encoded in SJIS. If it is, return
1836 CODING_CATEGORY_MASK_SJIS, else return 0. */
1837
1838 int
1839 detect_coding_sjis (src, src_end)
1840 unsigned char *src, *src_end;
1841 {
1842 unsigned char c;
1843
1844 while (src < src_end)
1845 {
1846 c = *src++;
1847 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1848 return 0;
1849 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1850 {
1851 if (src < src_end && *src++ < 0x40)
1852 return 0;
1853 }
1854 }
1855 return CODING_CATEGORY_MASK_SJIS;
1856 }
1857
1858 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1859 Check if a text is encoded in BIG5. If it is, return
1860 CODING_CATEGORY_MASK_BIG5, else return 0. */
1861
1862 int
1863 detect_coding_big5 (src, src_end)
1864 unsigned char *src, *src_end;
1865 {
1866 unsigned char c;
1867
1868 while (src < src_end)
1869 {
1870 c = *src++;
1871 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1872 return 0;
1873 if (c >= 0xA1)
1874 {
1875 if (src >= src_end)
1876 break;
1877 c = *src++;
1878 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1879 return 0;
1880 }
1881 }
1882 return CODING_CATEGORY_MASK_BIG5;
1883 }
1884
1885 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1886 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1887
1888 int
1889 decode_coding_sjis_big5 (coding, source, destination,
1890 src_bytes, dst_bytes, consumed, sjis_p)
1891 struct coding_system *coding;
1892 unsigned char *source, *destination;
1893 int src_bytes, dst_bytes;
1894 int *consumed;
1895 int sjis_p;
1896 {
1897 unsigned char *src = source;
1898 unsigned char *src_end = source + src_bytes;
1899 unsigned char *dst = destination;
1900 unsigned char *dst_end = destination + dst_bytes;
1901 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1902 from DST_END to assure overflow checking is necessary only at the
1903 head of loop. */
1904 unsigned char *adjusted_dst_end = dst_end - 3;
1905 Lisp_Object unification_table
1906 = coding->character_unification_table_for_decode;
1907
1908 if (!NILP (Venable_character_unification) && NILP (unification_table))
1909 unification_table = Vstandard_character_unification_table_for_decode;
1910
1911 while (src < src_end && dst < adjusted_dst_end)
1912 {
1913 /* SRC_BASE remembers the start position in source in each loop.
1914 The loop will be exited when there's not enough source text
1915 to analyze two-byte character (within macro ONE_MORE_BYTE).
1916 In that case, SRC is reset to SRC_BASE before exiting. */
1917 unsigned char *src_base = src;
1918 unsigned char c1 = *src++, c2, c3, c4;
1919
1920 if (c1 == '\r')
1921 {
1922 if (coding->eol_type == CODING_EOL_CRLF)
1923 {
1924 ONE_MORE_BYTE (c2);
1925 if (c2 == '\n')
1926 *dst++ = c2;
1927 else
1928 /* To process C2 again, SRC is subtracted by 1. */
1929 *dst++ = c1, src--;
1930 }
1931 else
1932 *dst++ = c1;
1933 }
1934 else if (c1 < 0x20)
1935 *dst++ = c1;
1936 else if (c1 < 0x80)
1937 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1938 else if (c1 < 0xA0 || c1 >= 0xE0)
1939 {
1940 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1941 if (sjis_p)
1942 {
1943 ONE_MORE_BYTE (c2);
1944 DECODE_SJIS (c1, c2, c3, c4);
1945 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1946 }
1947 else if (c1 >= 0xE0 && c1 < 0xFF)
1948 {
1949 int charset;
1950
1951 ONE_MORE_BYTE (c2);
1952 DECODE_BIG5 (c1, c2, charset, c3, c4);
1953 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1954 }
1955 else /* Invalid code */
1956 *dst++ = c1;
1957 }
1958 else
1959 {
1960 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1961 if (sjis_p)
1962 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1963 else
1964 {
1965 int charset;
1966
1967 ONE_MORE_BYTE (c2);
1968 DECODE_BIG5 (c1, c2, charset, c3, c4);
1969 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1970 }
1971 }
1972 continue;
1973
1974 label_end_of_loop:
1975 coding->carryover_size = src - src_base;
1976 bcopy (src_base, coding->carryover, coding->carryover_size);
1977 src = src_base;
1978 break;
1979 }
1980
1981 *consumed = src - source;
1982 return dst - destination;
1983 }
1984
1985 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1986 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1987 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1988 sure that all these charsets are registered as official charset
1989 (i.e. do not have extended leading-codes). Characters of other
1990 charsets are produced without any encoding. If SJIS_P is 1, encode
1991 SJIS text, else encode BIG5 text. */
1992
1993 int
1994 encode_coding_sjis_big5 (coding, source, destination,
1995 src_bytes, dst_bytes, consumed, sjis_p)
1996 struct coding_system *coding;
1997 unsigned char *source, *destination;
1998 int src_bytes, dst_bytes;
1999 int *consumed;
2000 int sjis_p;
2001 {
2002 unsigned char *src = source;
2003 unsigned char *src_end = source + src_bytes;
2004 unsigned char *dst = destination;
2005 unsigned char *dst_end = destination + dst_bytes;
2006 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2007 from DST_END to assure overflow checking is necessary only at the
2008 head of loop. */
2009 unsigned char *adjusted_dst_end = dst_end - 1;
2010 Lisp_Object unification_table
2011 = coding->character_unification_table_for_encode;
2012
2013 if (!NILP (Venable_character_unification) && NILP (unification_table))
2014 unification_table = Vstandard_character_unification_table_for_encode;
2015
2016 while (src < src_end && dst < adjusted_dst_end)
2017 {
2018 /* SRC_BASE remembers the start position in source in each loop.
2019 The loop will be exited when there's not enough source text
2020 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2021 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2022 before exiting. */
2023 unsigned char *src_base = src;
2024 unsigned char c1 = *src++, c2, c3, c4;
2025
2026 if (coding->composing)
2027 {
2028 if (c1 == 0xA0)
2029 {
2030 ONE_MORE_BYTE (c1);
2031 c1 &= 0x7F;
2032 }
2033 else if (c1 >= 0xA0)
2034 c1 -= 0x20;
2035 else
2036 coding->composing = 0;
2037 }
2038
2039 switch (emacs_code_class[c1])
2040 {
2041 case EMACS_ascii_code:
2042 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2043 break;
2044
2045 case EMACS_control_code:
2046 *dst++ = c1;
2047 break;
2048
2049 case EMACS_carriage_return_code:
2050 if (!coding->selective)
2051 {
2052 *dst++ = c1;
2053 break;
2054 }
2055 /* fall down to treat '\r' as '\n' ... */
2056
2057 case EMACS_linefeed_code:
2058 if (coding->eol_type == CODING_EOL_LF
2059 || coding->eol_type == CODING_EOL_UNDECIDED)
2060 *dst++ = '\n';
2061 else if (coding->eol_type == CODING_EOL_CRLF)
2062 *dst++ = '\r', *dst++ = '\n';
2063 else
2064 *dst++ = '\r';
2065 break;
2066
2067 case EMACS_leading_code_2:
2068 ONE_MORE_BYTE (c2);
2069 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2070 break;
2071
2072 case EMACS_leading_code_3:
2073 TWO_MORE_BYTES (c2, c3);
2074 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2075 break;
2076
2077 case EMACS_leading_code_4:
2078 THREE_MORE_BYTES (c2, c3, c4);
2079 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2080 break;
2081
2082 case EMACS_leading_code_composition:
2083 coding->composing = 1;
2084 break;
2085
2086 default: /* i.e. case EMACS_invalid_code: */
2087 *dst++ = c1;
2088 }
2089 continue;
2090
2091 label_end_of_loop:
2092 coding->carryover_size = src_end - src_base;
2093 bcopy (src_base, coding->carryover, coding->carryover_size);
2094 src = src_end;
2095 break;
2096 }
2097
2098 *consumed = src - source;
2099 return dst - destination;
2100 }
2101
2102 \f
2103 /*** 5. End-of-line handlers ***/
2104
2105 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2106 This function is called only when `coding->eol_type' is
2107 CODING_EOL_CRLF or CODING_EOL_CR. */
2108
2109 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2110 struct coding_system *coding;
2111 unsigned char *source, *destination;
2112 int src_bytes, dst_bytes;
2113 int *consumed;
2114 {
2115 unsigned char *src = source;
2116 unsigned char *src_end = source + src_bytes;
2117 unsigned char *dst = destination;
2118 unsigned char *dst_end = destination + dst_bytes;
2119 int produced;
2120
2121 switch (coding->eol_type)
2122 {
2123 case CODING_EOL_CRLF:
2124 {
2125 /* Since the maximum bytes produced by each loop is 2, we
2126 subtract 1 from DST_END to assure overflow checking is
2127 necessary only at the head of loop. */
2128 unsigned char *adjusted_dst_end = dst_end - 1;
2129
2130 while (src < src_end && dst < adjusted_dst_end)
2131 {
2132 unsigned char *src_base = src;
2133 unsigned char c = *src++;
2134 if (c == '\r')
2135 {
2136 ONE_MORE_BYTE (c);
2137 if (c != '\n')
2138 *dst++ = '\r';
2139 *dst++ = c;
2140 }
2141 else
2142 *dst++ = c;
2143 continue;
2144
2145 label_end_of_loop:
2146 coding->carryover_size = src - src_base;
2147 bcopy (src_base, coding->carryover, coding->carryover_size);
2148 src = src_base;
2149 break;
2150 }
2151 *consumed = src - source;
2152 produced = dst - destination;
2153 break;
2154 }
2155
2156 case CODING_EOL_CR:
2157 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2158 bcopy (source, destination, produced);
2159 dst_end = destination + produced;
2160 while (dst < dst_end)
2161 if (*dst++ == '\r') dst[-1] = '\n';
2162 *consumed = produced;
2163 break;
2164
2165 default: /* i.e. case: CODING_EOL_LF */
2166 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2167 bcopy (source, destination, produced);
2168 *consumed = produced;
2169 break;
2170 }
2171
2172 return produced;
2173 }
2174
2175 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2176 format of end-of-line according to `coding->eol_type'. If
2177 `coding->selective' is 1, code '\r' in source text also means
2178 end-of-line. */
2179
2180 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2181 struct coding_system *coding;
2182 unsigned char *source, *destination;
2183 int src_bytes, dst_bytes;
2184 int *consumed;
2185 {
2186 unsigned char *src = source;
2187 unsigned char *dst = destination;
2188 int produced;
2189
2190 if (src_bytes <= 0)
2191 return 0;
2192
2193 switch (coding->eol_type)
2194 {
2195 case CODING_EOL_LF:
2196 case CODING_EOL_UNDECIDED:
2197 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2198 bcopy (source, destination, produced);
2199 if (coding->selective)
2200 {
2201 int i = produced;
2202 while (i--)
2203 if (*dst++ == '\r') dst[-1] = '\n';
2204 }
2205 *consumed = produced;
2206
2207 case CODING_EOL_CRLF:
2208 {
2209 unsigned char c;
2210 unsigned char *src_end = source + src_bytes;
2211 unsigned char *dst_end = destination + dst_bytes;
2212 /* Since the maximum bytes produced by each loop is 2, we
2213 subtract 1 from DST_END to assure overflow checking is
2214 necessary only at the head of loop. */
2215 unsigned char *adjusted_dst_end = dst_end - 1;
2216
2217 while (src < src_end && dst < adjusted_dst_end)
2218 {
2219 c = *src++;
2220 if (c == '\n' || (c == '\r' && coding->selective))
2221 *dst++ = '\r', *dst++ = '\n';
2222 else
2223 *dst++ = c;
2224 }
2225 produced = dst - destination;
2226 *consumed = src - source;
2227 break;
2228 }
2229
2230 default: /* i.e. case CODING_EOL_CR: */
2231 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2232 bcopy (source, destination, produced);
2233 {
2234 int i = produced;
2235 while (i--)
2236 if (*dst++ == '\n') dst[-1] = '\r';
2237 }
2238 *consumed = produced;
2239 }
2240
2241 return produced;
2242 }
2243
2244 \f
2245 /*** 6. C library functions ***/
2246
2247 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2248 has a property `coding-system'. The value of this property is a
2249 vector of length 5 (called as coding-vector). Among elements of
2250 this vector, the first (element[0]) and the fifth (element[4])
2251 carry important information for decoding/encoding. Before
2252 decoding/encoding, this information should be set in fields of a
2253 structure of type `coding_system'.
2254
2255 A value of property `coding-system' can be a symbol of another
2256 subsidiary coding-system. In that case, Emacs gets coding-vector
2257 from that symbol.
2258
2259 `element[0]' contains information to be set in `coding->type'. The
2260 value and its meaning is as follows:
2261
2262 0 -- coding_type_emacs_mule
2263 1 -- coding_type_sjis
2264 2 -- coding_type_iso2022
2265 3 -- coding_type_big5
2266 4 -- coding_type_ccl encoder/decoder written in CCL
2267 nil -- coding_type_no_conversion
2268 t -- coding_type_undecided (automatic conversion on decoding,
2269 no-conversion on encoding)
2270
2271 `element[4]' contains information to be set in `coding->flags' and
2272 `coding->spec'. The meaning varies by `coding->type'.
2273
2274 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2275 of length 32 (of which the first 13 sub-elements are used now).
2276 Meanings of these sub-elements are:
2277
2278 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2279 If the value is an integer of valid charset, the charset is
2280 assumed to be designated to graphic register N initially.
2281
2282 If the value is minus, it is a minus value of charset which
2283 reserves graphic register N, which means that the charset is
2284 not designated initially but should be designated to graphic
2285 register N just before encoding a character in that charset.
2286
2287 If the value is nil, graphic register N is never used on
2288 encoding.
2289
2290 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2291 Each value takes t or nil. See the section ISO2022 of
2292 `coding.h' for more information.
2293
2294 If `coding->type' is `coding_type_big5', element[4] is t to denote
2295 BIG5-ETen or nil to denote BIG5-HKU.
2296
2297 If `coding->type' takes the other value, element[4] is ignored.
2298
2299 Emacs Lisp's coding system also carries information about format of
2300 end-of-line in a value of property `eol-type'. If the value is
2301 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2302 means CODING_EOL_CR. If it is not integer, it should be a vector
2303 of subsidiary coding systems of which property `eol-type' has one
2304 of above values.
2305
2306 */
2307
2308 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2309 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2310 is setup so that no conversion is necessary and return -1, else
2311 return 0. */
2312
2313 int
2314 setup_coding_system (coding_system, coding)
2315 Lisp_Object coding_system;
2316 struct coding_system *coding;
2317 {
2318 Lisp_Object coding_spec, plist, type, eol_type;
2319 Lisp_Object val;
2320 int i;
2321
2322 /* At first, set several fields to default values. */
2323 coding->last_block = 0;
2324 coding->selective = 0;
2325 coding->composing = 0;
2326 coding->direction = 0;
2327 coding->carryover_size = 0;
2328 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2329 coding->character_unification_table_for_decode = Qnil;
2330 coding->character_unification_table_for_encode = Qnil;
2331
2332 Vlast_coding_system_used = coding->symbol = coding_system;
2333 eol_type = Qnil;
2334
2335 /* Get values of property `coding-system' and `eol-type'.
2336 Also get values of coding system properties:
2337 `post-read-conversion', `pre-write-conversion',
2338 `character-unification-table-for-decode',
2339 `character-unification-table-for-encode'. */
2340 coding_spec = Fget (coding_system, Qcoding_system);
2341 if (!VECTORP (coding_spec)
2342 || XVECTOR (coding_spec)->size != 5
2343 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2344 goto label_invalid_coding_system;
2345 if (!inhibit_eol_conversion)
2346 eol_type = Fget (coding_system, Qeol_type);
2347
2348 plist = XVECTOR (coding_spec)->contents[3];
2349 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2350 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2351 val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2352 if (SYMBOLP (val))
2353 val = Fget (val, Qcharacter_unification_table_for_decode);
2354 coding->character_unification_table_for_decode
2355 = CHAR_TABLE_P (val) ? val : Qnil;
2356 val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2357 if (SYMBOLP (val))
2358 val = Fget (val, Qcharacter_unification_table_for_encode);
2359 coding->character_unification_table_for_encode
2360 = CHAR_TABLE_P (val) ? val : Qnil;
2361
2362 val = Fplist_get (plist, Qsafe_charsets);
2363 if (EQ (val, Qt))
2364 {
2365 for (i = 0; i <= MAX_CHARSET; i++)
2366 coding->safe_charsets[i] = 1;
2367 }
2368 else
2369 {
2370 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2371 while (CONSP (val))
2372 {
2373 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2374 coding->safe_charsets[i] = 1;
2375 val = XCONS (val)->cdr;
2376 }
2377 }
2378
2379 if (VECTORP (eol_type))
2380 {
2381 coding->eol_type = CODING_EOL_UNDECIDED;
2382 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2383 }
2384 else if (XFASTINT (eol_type) == 1)
2385 {
2386 coding->eol_type = CODING_EOL_CRLF;
2387 coding->common_flags
2388 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2389 }
2390 else if (XFASTINT (eol_type) == 2)
2391 {
2392 coding->eol_type = CODING_EOL_CR;
2393 coding->common_flags
2394 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2395 }
2396 else
2397 {
2398 coding->eol_type = CODING_EOL_LF;
2399 coding->common_flags = 0;
2400 }
2401
2402 type = XVECTOR (coding_spec)->contents[0];
2403 switch (XFASTINT (type))
2404 {
2405 case 0:
2406 coding->type = coding_type_emacs_mule;
2407 if (!NILP (coding->post_read_conversion))
2408 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2409 if (!NILP (coding->pre_write_conversion))
2410 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2411 break;
2412
2413 case 1:
2414 coding->type = coding_type_sjis;
2415 coding->common_flags
2416 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2417 break;
2418
2419 case 2:
2420 coding->type = coding_type_iso2022;
2421 coding->common_flags
2422 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2423 {
2424 Lisp_Object val, temp;
2425 Lisp_Object *flags;
2426 int i, charset, default_reg_bits = 0;
2427
2428 val = XVECTOR (coding_spec)->contents[4];
2429
2430 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2431 goto label_invalid_coding_system;
2432
2433 flags = XVECTOR (val)->contents;
2434 coding->flags
2435 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2436 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2437 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2438 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2439 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2440 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2441 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2442 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2443 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2444 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2445 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2446 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2447 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2448 );
2449
2450 /* Invoke graphic register 0 to plane 0. */
2451 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2452 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2453 CODING_SPEC_ISO_INVOCATION (coding, 1)
2454 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2455 /* Not single shifting at first. */
2456 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2457 /* Beginning of buffer should also be regarded as bol. */
2458 CODING_SPEC_ISO_BOL (coding) = 1;
2459
2460 for (charset = 0; charset <= MAX_CHARSET; charset++)
2461 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2462 val = Vcharset_revision_alist;
2463 while (CONSP (val))
2464 {
2465 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2466 if (charset >= 0
2467 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2468 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2469 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2470 val = XCONS (val)->cdr;
2471 }
2472
2473 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2474 FLAGS[REG] can be one of below:
2475 integer CHARSET: CHARSET occupies register I,
2476 t: designate nothing to REG initially, but can be used
2477 by any charsets,
2478 list of integer, nil, or t: designate the first
2479 element (if integer) to REG initially, the remaining
2480 elements (if integer) is designated to REG on request,
2481 if an element is t, REG can be used by any charset,
2482 nil: REG is never used. */
2483 for (charset = 0; charset <= MAX_CHARSET; charset++)
2484 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2485 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2486 for (i = 0; i < 4; i++)
2487 {
2488 if (INTEGERP (flags[i])
2489 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2490 || (charset = get_charset_id (flags[i])) >= 0)
2491 {
2492 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2493 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2494 }
2495 else if (EQ (flags[i], Qt))
2496 {
2497 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2498 default_reg_bits |= 1 << i;
2499 }
2500 else if (CONSP (flags[i]))
2501 {
2502 Lisp_Object tail = flags[i];
2503
2504 if (INTEGERP (XCONS (tail)->car)
2505 && (charset = XINT (XCONS (tail)->car),
2506 CHARSET_VALID_P (charset))
2507 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2508 {
2509 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2510 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2511 }
2512 else
2513 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2514 tail = XCONS (tail)->cdr;
2515 while (CONSP (tail))
2516 {
2517 if (INTEGERP (XCONS (tail)->car)
2518 && (charset = XINT (XCONS (tail)->car),
2519 CHARSET_VALID_P (charset))
2520 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2521 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2522 = i;
2523 else if (EQ (XCONS (tail)->car, Qt))
2524 default_reg_bits |= 1 << i;
2525 tail = XCONS (tail)->cdr;
2526 }
2527 }
2528 else
2529 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2530
2531 CODING_SPEC_ISO_DESIGNATION (coding, i)
2532 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2533 }
2534
2535 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2536 {
2537 /* REG 1 can be used only by locking shift in 7-bit env. */
2538 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2539 default_reg_bits &= ~2;
2540 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2541 /* Without any shifting, only REG 0 and 1 can be used. */
2542 default_reg_bits &= 3;
2543 }
2544
2545 for (charset = 0; charset <= MAX_CHARSET; charset++)
2546 if (CHARSET_VALID_P (charset)
2547 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2548 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2549 {
2550 /* We have not yet decided where to designate CHARSET. */
2551 int reg_bits = default_reg_bits;
2552
2553 if (CHARSET_CHARS (charset) == 96)
2554 /* A charset of CHARS96 can't be designated to REG 0. */
2555 reg_bits &= ~1;
2556
2557 if (reg_bits)
2558 /* There exist some default graphic register. */
2559 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2560 = (reg_bits & 1
2561 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2562 else
2563 /* We anyway have to designate CHARSET to somewhere. */
2564 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2565 = (CHARSET_CHARS (charset) == 94
2566 ? 0
2567 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2568 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2569 ? 1
2570 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2571 ? 2 : 0)));
2572 }
2573 }
2574 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
2575 break;
2576
2577 case 3:
2578 coding->type = coding_type_big5;
2579 coding->common_flags
2580 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2581 coding->flags
2582 = (NILP (XVECTOR (coding_spec)->contents[4])
2583 ? CODING_FLAG_BIG5_HKU
2584 : CODING_FLAG_BIG5_ETEN);
2585 break;
2586
2587 case 4:
2588 coding->type = coding_type_ccl;
2589 coding->common_flags
2590 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2591 {
2592 Lisp_Object val = XVECTOR (coding_spec)->contents[4];
2593 if (CONSP (val)
2594 && VECTORP (XCONS (val)->car)
2595 && VECTORP (XCONS (val)->cdr))
2596 {
2597 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2598 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2599 }
2600 else
2601 goto label_invalid_coding_system;
2602 }
2603 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
2604 break;
2605
2606 case 5:
2607 coding->type = coding_type_raw_text;
2608 break;
2609
2610 default:
2611 if (EQ (type, Qt))
2612 {
2613 coding->type = coding_type_undecided;
2614 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2615 }
2616 else
2617 coding->type = coding_type_no_conversion;
2618 break;
2619 }
2620 return 0;
2621
2622 label_invalid_coding_system:
2623 coding->type = coding_type_no_conversion;
2624 coding->common_flags = 0;
2625 coding->eol_type = CODING_EOL_LF;
2626 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2627 = Qnil;
2628 return -1;
2629 }
2630
2631 /* Emacs has a mechanism to automatically detect a coding system if it
2632 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2633 it's impossible to distinguish some coding systems accurately
2634 because they use the same range of codes. So, at first, coding
2635 systems are categorized into 7, those are:
2636
2637 o coding-category-emacs-mule
2638
2639 The category for a coding system which has the same code range
2640 as Emacs' internal format. Assigned the coding-system (Lisp
2641 symbol) `emacs-mule' by default.
2642
2643 o coding-category-sjis
2644
2645 The category for a coding system which has the same code range
2646 as SJIS. Assigned the coding-system (Lisp
2647 symbol) `japanese-shift-jis' by default.
2648
2649 o coding-category-iso-7
2650
2651 The category for a coding system which has the same code range
2652 as ISO2022 of 7-bit environment. This doesn't use any locking
2653 shift and single shift functions. Assigned the coding-system
2654 (Lisp symbol) `iso-2022-7bit' by default.
2655
2656 o coding-category-iso-8-1
2657
2658 The category for a coding system which has the same code range
2659 as ISO2022 of 8-bit environment and graphic plane 1 used only
2660 for DIMENSION1 charset. This doesn't use any locking shift
2661 and single shift functions. Assigned the coding-system (Lisp
2662 symbol) `iso-latin-1' by default.
2663
2664 o coding-category-iso-8-2
2665
2666 The category for a coding system which has the same code range
2667 as ISO2022 of 8-bit environment and graphic plane 1 used only
2668 for DIMENSION2 charset. This doesn't use any locking shift
2669 and single shift functions. Assigned the coding-system (Lisp
2670 symbol) `japanese-iso-8bit' by default.
2671
2672 o coding-category-iso-7-else
2673
2674 The category for a coding system which has the same code range
2675 as ISO2022 of 7-bit environemnt but uses locking shift or
2676 single shift functions. Assigned the coding-system (Lisp
2677 symbol) `iso-2022-7bit-lock' by default.
2678
2679 o coding-category-iso-8-else
2680
2681 The category for a coding system which has the same code range
2682 as ISO2022 of 8-bit environemnt but uses locking shift or
2683 single shift functions. Assigned the coding-system (Lisp
2684 symbol) `iso-2022-8bit-ss2' by default.
2685
2686 o coding-category-big5
2687
2688 The category for a coding system which has the same code range
2689 as BIG5. Assigned the coding-system (Lisp symbol)
2690 `cn-big5' by default.
2691
2692 o coding-category-binary
2693
2694 The category for a coding system not categorized in any of the
2695 above. Assigned the coding-system (Lisp symbol)
2696 `no-conversion' by default.
2697
2698 Each of them is a Lisp symbol and the value is an actual
2699 `coding-system's (this is also a Lisp symbol) assigned by a user.
2700 What Emacs does actually is to detect a category of coding system.
2701 Then, it uses a `coding-system' assigned to it. If Emacs can't
2702 decide only one possible category, it selects a category of the
2703 highest priority. Priorities of categories are also specified by a
2704 user in a Lisp variable `coding-category-list'.
2705
2706 */
2707
2708 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2709 If it detects possible coding systems, return an integer in which
2710 appropriate flag bits are set. Flag bits are defined by macros
2711 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2712
2713 int
2714 detect_coding_mask (src, src_bytes)
2715 unsigned char *src;
2716 int src_bytes;
2717 {
2718 register unsigned char c;
2719 unsigned char *src_end = src + src_bytes;
2720 int mask;
2721
2722 /* At first, skip all ASCII characters and control characters except
2723 for three ISO2022 specific control characters. */
2724 label_loop_detect_coding:
2725 while (src < src_end)
2726 {
2727 c = *src;
2728 if (c >= 0x80
2729 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2730 break;
2731 src++;
2732 }
2733
2734 if (src >= src_end)
2735 /* We found nothing other than ASCII. There's nothing to do. */
2736 return CODING_CATEGORY_MASK_ANY;
2737
2738 /* The text seems to be encoded in some multilingual coding system.
2739 Now, try to find in which coding system the text is encoded. */
2740 if (c < 0x80)
2741 {
2742 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2743 /* C is an ISO2022 specific control code of C0. */
2744 mask = detect_coding_iso2022 (src, src_end);
2745 src++;
2746 if (mask == 0)
2747 /* No valid ISO2022 code follows C. Try again. */
2748 goto label_loop_detect_coding;
2749 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
2750 }
2751 else if (c < 0xA0)
2752 {
2753 /* If C is a special latin extra code,
2754 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2755 or is an ISO2022 control-sequence-introducer (CSI),
2756 we should also consider the possibility of ISO2022 codings. */
2757 if ((VECTORP (Vlatin_extra_code_table)
2758 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2759 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2760 || (c == ISO_CODE_CSI
2761 && (src < src_end
2762 && (*src == ']'
2763 || (src + 1 < src_end
2764 && src[1] == ']'
2765 && (*src == '0' || *src == '1' || *src == '2'))))))
2766 mask = (detect_coding_iso2022 (src, src_end)
2767 | detect_coding_sjis (src, src_end)
2768 | detect_coding_emacs_mule (src, src_end)
2769 | CODING_CATEGORY_MASK_RAW_TEXT);
2770
2771 else
2772 /* C is the first byte of SJIS character code,
2773 or a leading-code of Emacs' internal format (emacs-mule). */
2774 mask = (detect_coding_sjis (src, src_end)
2775 | detect_coding_emacs_mule (src, src_end)
2776 | CODING_CATEGORY_MASK_RAW_TEXT);
2777 }
2778 else
2779 /* C is a character of ISO2022 in graphic plane right,
2780 or a SJIS's 1-byte character code (i.e. JISX0201),
2781 or the first byte of BIG5's 2-byte code. */
2782 mask = (detect_coding_iso2022 (src, src_end)
2783 | detect_coding_sjis (src, src_end)
2784 | detect_coding_big5 (src, src_end)
2785 | CODING_CATEGORY_MASK_RAW_TEXT);
2786
2787 return mask;
2788 }
2789
2790 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2791 The information of the detected coding system is set in CODING. */
2792
2793 void
2794 detect_coding (coding, src, src_bytes)
2795 struct coding_system *coding;
2796 unsigned char *src;
2797 int src_bytes;
2798 {
2799 int mask = detect_coding_mask (src, src_bytes);
2800 int idx;
2801 Lisp_Object val = Vcoding_category_list;
2802
2803 if (mask == CODING_CATEGORY_MASK_ANY)
2804 /* We found nothing other than ASCII. There's nothing to do. */
2805 return;
2806
2807 /* We found some plausible coding systems. Let's use a coding
2808 system of the highest priority. */
2809
2810 if (CONSP (val))
2811 while (!NILP (val))
2812 {
2813 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2814 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2815 break;
2816 val = XCONS (val)->cdr;
2817 }
2818 else
2819 val = Qnil;
2820
2821 if (NILP (val))
2822 {
2823 /* For unknown reason, `Vcoding_category_list' contains none of
2824 found categories. Let's use any of them. */
2825 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2826 if (mask & (1 << idx))
2827 break;
2828 }
2829 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2830 }
2831
2832 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2833 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2834 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2835
2836 #define MAX_EOL_CHECK_COUNT 3
2837
2838 int
2839 detect_eol_type (src, src_bytes)
2840 unsigned char *src;
2841 int src_bytes;
2842 {
2843 unsigned char *src_end = src + src_bytes;
2844 unsigned char c;
2845 int total = 0; /* How many end-of-lines are found so far. */
2846 int eol_type = CODING_EOL_UNDECIDED;
2847 int this_eol_type;
2848
2849 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2850 {
2851 c = *src++;
2852 if (c == '\n' || c == '\r')
2853 {
2854 total++;
2855 if (c == '\n')
2856 this_eol_type = CODING_EOL_LF;
2857 else if (src >= src_end || *src != '\n')
2858 this_eol_type = CODING_EOL_CR;
2859 else
2860 this_eol_type = CODING_EOL_CRLF, src++;
2861
2862 if (eol_type == CODING_EOL_UNDECIDED)
2863 /* This is the first end-of-line. */
2864 eol_type = this_eol_type;
2865 else if (eol_type != this_eol_type)
2866 /* The found type is different from what found before.
2867 Let's notice the caller about this inconsistency. */
2868 return CODING_EOL_INCONSISTENT;
2869 }
2870 }
2871
2872 return eol_type;
2873 }
2874
2875 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2876 is encoded. If it detects an appropriate format of end-of-line, it
2877 sets the information in *CODING. */
2878
2879 void
2880 detect_eol (coding, src, src_bytes)
2881 struct coding_system *coding;
2882 unsigned char *src;
2883 int src_bytes;
2884 {
2885 Lisp_Object val;
2886 int eol_type = detect_eol_type (src, src_bytes);
2887
2888 if (eol_type == CODING_EOL_UNDECIDED)
2889 /* We found no end-of-line in the source text. */
2890 return;
2891
2892 if (eol_type == CODING_EOL_INCONSISTENT)
2893 {
2894 #if 0
2895 /* This code is suppressed until we find a better way to
2896 distinguish raw text file and binary file. */
2897
2898 /* If we have already detected that the coding is raw-text, the
2899 coding should actually be no-conversion. */
2900 if (coding->type == coding_type_raw_text)
2901 {
2902 setup_coding_system (Qno_conversion, coding);
2903 return;
2904 }
2905 /* Else, let's decode only text code anyway. */
2906 #endif /* 0 */
2907 eol_type = CODING_EOL_LF;
2908 }
2909
2910 val = Fget (coding->symbol, Qeol_type);
2911 if (VECTORP (val) && XVECTOR (val)->size == 3)
2912 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2913 }
2914
2915 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2916 decoding, it may detect coding system and format of end-of-line if
2917 those are not yet decided. */
2918
2919 int
2920 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2921 struct coding_system *coding;
2922 unsigned char *source, *destination;
2923 int src_bytes, dst_bytes;
2924 int *consumed;
2925 {
2926 int produced;
2927
2928 if (src_bytes <= 0)
2929 {
2930 *consumed = 0;
2931 return 0;
2932 }
2933
2934 if (coding->type == coding_type_undecided)
2935 detect_coding (coding, source, src_bytes);
2936
2937 if (coding->eol_type == CODING_EOL_UNDECIDED)
2938 detect_eol (coding, source, src_bytes);
2939
2940 coding->carryover_size = 0;
2941 switch (coding->type)
2942 {
2943 case coding_type_no_conversion:
2944 label_no_conversion:
2945 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2946 bcopy (source, destination, produced);
2947 *consumed = produced;
2948 break;
2949
2950 case coding_type_emacs_mule:
2951 case coding_type_undecided:
2952 case coding_type_raw_text:
2953 if (coding->eol_type == CODING_EOL_LF
2954 || coding->eol_type == CODING_EOL_UNDECIDED)
2955 goto label_no_conversion;
2956 produced = decode_eol (coding, source, destination,
2957 src_bytes, dst_bytes, consumed);
2958 break;
2959
2960 case coding_type_sjis:
2961 produced = decode_coding_sjis_big5 (coding, source, destination,
2962 src_bytes, dst_bytes, consumed,
2963 1);
2964 break;
2965
2966 case coding_type_iso2022:
2967 produced = decode_coding_iso2022 (coding, source, destination,
2968 src_bytes, dst_bytes, consumed);
2969 break;
2970
2971 case coding_type_big5:
2972 produced = decode_coding_sjis_big5 (coding, source, destination,
2973 src_bytes, dst_bytes, consumed,
2974 0);
2975 break;
2976
2977 case coding_type_ccl:
2978 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2979 src_bytes, dst_bytes, consumed);
2980 break;
2981 }
2982
2983 return produced;
2984 }
2985
2986 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2987
2988 int
2989 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2990 struct coding_system *coding;
2991 unsigned char *source, *destination;
2992 int src_bytes, dst_bytes;
2993 int *consumed;
2994 {
2995 int produced;
2996
2997 switch (coding->type)
2998 {
2999 case coding_type_no_conversion:
3000 label_no_conversion:
3001 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
3002 if (produced > 0)
3003 {
3004 bcopy (source, destination, produced);
3005 if (coding->selective)
3006 {
3007 unsigned char *p = destination, *pend = destination + produced;
3008 while (p < pend)
3009 if (*p++ == '\015') p[-1] = '\n';
3010 }
3011 }
3012 *consumed = produced;
3013 break;
3014
3015 case coding_type_emacs_mule:
3016 case coding_type_undecided:
3017 case coding_type_raw_text:
3018 if (coding->eol_type == CODING_EOL_LF
3019 || coding->eol_type == CODING_EOL_UNDECIDED)
3020 goto label_no_conversion;
3021 produced = encode_eol (coding, source, destination,
3022 src_bytes, dst_bytes, consumed);
3023 break;
3024
3025 case coding_type_sjis:
3026 produced = encode_coding_sjis_big5 (coding, source, destination,
3027 src_bytes, dst_bytes, consumed,
3028 1);
3029 break;
3030
3031 case coding_type_iso2022:
3032 produced = encode_coding_iso2022 (coding, source, destination,
3033 src_bytes, dst_bytes, consumed);
3034 break;
3035
3036 case coding_type_big5:
3037 produced = encode_coding_sjis_big5 (coding, source, destination,
3038 src_bytes, dst_bytes, consumed,
3039 0);
3040 break;
3041
3042 case coding_type_ccl:
3043 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
3044 src_bytes, dst_bytes, consumed);
3045 break;
3046 }
3047
3048 return produced;
3049 }
3050
3051 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3052
3053 /* Return maximum size (bytes) of a buffer enough for decoding
3054 SRC_BYTES of text encoded in CODING. */
3055
3056 int
3057 decoding_buffer_size (coding, src_bytes)
3058 struct coding_system *coding;
3059 int src_bytes;
3060 {
3061 int magnification;
3062
3063 if (coding->type == coding_type_iso2022)
3064 magnification = 3;
3065 else if (coding->type == coding_type_ccl)
3066 magnification = coding->spec.ccl.decoder.buf_magnification;
3067 else
3068 magnification = 2;
3069
3070 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3071 }
3072
3073 /* Return maximum size (bytes) of a buffer enough for encoding
3074 SRC_BYTES of text to CODING. */
3075
3076 int
3077 encoding_buffer_size (coding, src_bytes)
3078 struct coding_system *coding;
3079 int src_bytes;
3080 {
3081 int magnification;
3082
3083 if (coding->type == coding_type_ccl)
3084 magnification = coding->spec.ccl.encoder.buf_magnification;
3085 else
3086 magnification = 3;
3087
3088 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3089 }
3090
3091 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3092 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3093 #endif
3094
3095 char *conversion_buffer;
3096 int conversion_buffer_size;
3097
3098 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3099 or decoding. Sufficient memory is allocated automatically. If we
3100 run out of memory, return NULL. */
3101
3102 char *
3103 get_conversion_buffer (size)
3104 int size;
3105 {
3106 if (size > conversion_buffer_size)
3107 {
3108 char *buf;
3109 int real_size = conversion_buffer_size * 2;
3110
3111 while (real_size < size) real_size *= 2;
3112 buf = (char *) xmalloc (real_size);
3113 xfree (conversion_buffer);
3114 conversion_buffer = buf;
3115 conversion_buffer_size = real_size;
3116 }
3117 return conversion_buffer;
3118 }
3119
3120 \f
3121 #ifdef emacs
3122 /*** 7. Emacs Lisp library functions ***/
3123
3124 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3125 "Return t if OBJECT is nil or a coding-system.\n\
3126 See document of make-coding-system for coding-system object.")
3127 (obj)
3128 Lisp_Object obj;
3129 {
3130 if (NILP (obj))
3131 return Qt;
3132 if (!SYMBOLP (obj))
3133 return Qnil;
3134 /* Get coding-spec vector for OBJ. */
3135 obj = Fget (obj, Qcoding_system);
3136 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
3137 ? Qt : Qnil);
3138 }
3139
3140 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3141 Sread_non_nil_coding_system, 1, 1, 0,
3142 "Read a coding system from the minibuffer, prompting with string PROMPT.")
3143 (prompt)
3144 Lisp_Object prompt;
3145 {
3146 Lisp_Object val;
3147 do
3148 {
3149 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
3150 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
3151 }
3152 while (XSTRING (val)->size == 0);
3153 return (Fintern (val, Qnil));
3154 }
3155
3156 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
3157 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
3158 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
3159 (prompt, default_coding_system)
3160 Lisp_Object prompt, default_coding_system;
3161 {
3162 Lisp_Object val;
3163 if (SYMBOLP (default_coding_system))
3164 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
3165 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
3166 Qt, Qnil, Qcoding_system_history,
3167 default_coding_system, Qnil);
3168 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3169 }
3170
3171 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3172 1, 1, 0,
3173 "Check validity of CODING-SYSTEM.\n\
3174 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3175 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3176 The value of property should be a vector of length 5.")
3177 (coding_system)
3178 Lisp_Object coding_system;
3179 {
3180 CHECK_SYMBOL (coding_system, 0);
3181 if (!NILP (Fcoding_system_p (coding_system)))
3182 return coding_system;
3183 while (1)
3184 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3185 }
3186
3187 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3188 2, 2, 0,
3189 "Detect coding system of the text in the region between START and END.\n\
3190 Return a list of possible coding systems ordered by priority.\n\
3191 If only ASCII characters are found, it returns `undecided'\n\
3192 or its subsidiary coding system according to a detected end-of-line format.")
3193 (b, e)
3194 Lisp_Object b, e;
3195 {
3196 int coding_mask, eol_type;
3197 Lisp_Object val;
3198 int beg, end;
3199
3200 validate_region (&b, &e);
3201 beg = XINT (b), end = XINT (e);
3202 if (beg < GPT && end >= GPT) move_gap (end);
3203
3204 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3205 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3206
3207 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3208 {
3209 val = Qundecided;
3210 if (eol_type != CODING_EOL_UNDECIDED
3211 && eol_type != CODING_EOL_INCONSISTENT)
3212 {
3213 Lisp_Object val2;
3214 val2 = Fget (Qundecided, Qeol_type);
3215 if (VECTORP (val2))
3216 val = XVECTOR (val2)->contents[eol_type];
3217 }
3218 }
3219 else
3220 {
3221 Lisp_Object val2;
3222
3223 /* At first, gather possible coding-systems in VAL in a reverse
3224 order. */
3225 val = Qnil;
3226 for (val2 = Vcoding_category_list;
3227 !NILP (val2);
3228 val2 = XCONS (val2)->cdr)
3229 {
3230 int idx
3231 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3232 if (coding_mask & (1 << idx))
3233 {
3234 #if 0
3235 /* This code is suppressed until we find a better way to
3236 distinguish raw text file and binary file. */
3237
3238 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3239 && eol_type == CODING_EOL_INCONSISTENT)
3240 val = Fcons (Qno_conversion, val);
3241 else
3242 #endif /* 0 */
3243 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3244 }
3245 }
3246
3247 /* Then, change the order of the list, while getting subsidiary
3248 coding-systems. */
3249 val2 = val;
3250 val = Qnil;
3251 if (eol_type == CODING_EOL_INCONSISTENT)
3252 eol_type == CODING_EOL_UNDECIDED;
3253 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3254 {
3255 if (eol_type == CODING_EOL_UNDECIDED)
3256 val = Fcons (XCONS (val2)->car, val);
3257 else
3258 {
3259 Lisp_Object val3;
3260 val3 = Fget (XCONS (val2)->car, Qeol_type);
3261 if (VECTORP (val3))
3262 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3263 else
3264 val = Fcons (XCONS (val2)->car, val);
3265 }
3266 }
3267 }
3268
3269 return val;
3270 }
3271
3272 /* Scan text in the region between *BEGP and *ENDP, skip characters
3273 which we never have to encode to (iff ENCODEP is 1) or decode from
3274 coding system CODING at the head and tail, then set BEGP and ENDP
3275 to the addresses of start and end of the text we actually convert. */
3276
3277 void
3278 shrink_conversion_area (begp, endp, coding, encodep)
3279 unsigned char **begp, **endp;
3280 struct coding_system *coding;
3281 int encodep;
3282 {
3283 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3284
3285 if (coding->eol_type != CODING_EOL_LF
3286 && coding->eol_type != CODING_EOL_UNDECIDED)
3287 /* Since we anyway have to convert end-of-line format, it is not
3288 worth skipping at most 100 bytes or so. */
3289 return;
3290
3291 if (encodep) /* for encoding */
3292 {
3293 switch (coding->type)
3294 {
3295 case coding_type_no_conversion:
3296 case coding_type_emacs_mule:
3297 case coding_type_undecided:
3298 case coding_type_raw_text:
3299 /* We need no conversion. */
3300 *begp = *endp;
3301 return;
3302 case coding_type_ccl:
3303 /* We can't skip any data. */
3304 return;
3305 case coding_type_iso2022:
3306 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3307 {
3308 unsigned char *bol = beg_addr;
3309 while (beg_addr < end_addr && *beg_addr < 0x80)
3310 {
3311 beg_addr++;
3312 if (*(beg_addr - 1) == '\n')
3313 bol = beg_addr;
3314 }
3315 beg_addr = bol;
3316 goto label_skip_tail;
3317 }
3318 /* fall down ... */
3319 default:
3320 /* We can skip all ASCII characters at the head and tail. */
3321 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3322 label_skip_tail:
3323 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3324 break;
3325 }
3326 }
3327 else /* for decoding */
3328 {
3329 switch (coding->type)
3330 {
3331 case coding_type_no_conversion:
3332 /* We need no conversion. */
3333 *begp = *endp;
3334 return;
3335 case coding_type_emacs_mule:
3336 case coding_type_raw_text:
3337 if (coding->eol_type == CODING_EOL_LF)
3338 {
3339 /* We need no conversion. */
3340 *begp = *endp;
3341 return;
3342 }
3343 /* We can skip all but carriage-return. */
3344 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3345 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3346 break;
3347 case coding_type_sjis:
3348 case coding_type_big5:
3349 /* We can skip all ASCII characters at the head. */
3350 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3351 /* We can skip all ASCII characters at the tail except for
3352 the second byte of SJIS or BIG5 code. */
3353 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3354 if (end_addr != *endp)
3355 end_addr++;
3356 break;
3357 case coding_type_ccl:
3358 /* We can't skip any data. */
3359 return;
3360 default: /* i.e. case coding_type_iso2022: */
3361 {
3362 unsigned char c;
3363
3364 /* We can skip all ASCII characters except for a few
3365 control codes at the head. */
3366 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3367 && c != ISO_CODE_CR && c != ISO_CODE_SO
3368 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3369 beg_addr++;
3370 }
3371 break;
3372 }
3373 }
3374 *begp = beg_addr;
3375 *endp = end_addr;
3376 return;
3377 }
3378
3379 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3380 text between B and E. B and E are buffer position. */
3381
3382 Lisp_Object
3383 code_convert_region (b, e, coding, encodep)
3384 Lisp_Object b, e;
3385 struct coding_system *coding;
3386 int encodep;
3387 {
3388 int beg, end, len, consumed, produced;
3389 char *buf;
3390 unsigned char *begp, *endp;
3391 int pos = PT;
3392
3393 validate_region (&b, &e);
3394 beg = XINT (b), end = XINT (e);
3395 if (beg < GPT && end >= GPT)
3396 move_gap (end);
3397
3398 if (encodep && !NILP (coding->pre_write_conversion))
3399 {
3400 /* We must call a pre-conversion function which may put a new
3401 text to be converted in a new buffer. */
3402 struct buffer *old = current_buffer, *new;
3403
3404 TEMP_SET_PT (beg);
3405 call2 (coding->pre_write_conversion, b, e);
3406 if (old != current_buffer)
3407 {
3408 /* Replace the original text by the text just generated. */
3409 len = ZV - BEGV;
3410 new = current_buffer;
3411 set_buffer_internal (old);
3412 del_range (beg, end);
3413 insert_from_buffer (new, 1, len, 0);
3414 end = beg + len;
3415 }
3416 }
3417
3418 /* We may be able to shrink the conversion region. */
3419 begp = POS_ADDR (beg); endp = begp + (end - beg);
3420 shrink_conversion_area (&begp, &endp, coding, encodep);
3421
3422 if (begp == endp)
3423 /* We need no conversion. */
3424 len = end - beg;
3425 else
3426 {
3427 beg += begp - POS_ADDR (beg);
3428 end = beg + (endp - begp);
3429
3430 if (encodep)
3431 len = encoding_buffer_size (coding, end - beg);
3432 else
3433 len = decoding_buffer_size (coding, end - beg);
3434 buf = get_conversion_buffer (len);
3435
3436 coding->last_block = 1;
3437 produced = (encodep
3438 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3439 &consumed)
3440 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3441 &consumed));
3442
3443 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3444
3445 TEMP_SET_PT (beg);
3446 insert (buf, produced);
3447 del_range (PT, PT + end - beg);
3448 if (pos >= end)
3449 pos = PT + (pos - end);
3450 else if (pos > beg)
3451 pos = beg;
3452 TEMP_SET_PT (pos);
3453 }
3454
3455 if (!encodep && !NILP (coding->post_read_conversion))
3456 {
3457 /* We must call a post-conversion function which may alter
3458 the text just converted. */
3459 Lisp_Object insval;
3460
3461 beg = XINT (b);
3462 TEMP_SET_PT (beg);
3463 insval = call1 (coding->post_read_conversion, make_number (len));
3464 CHECK_NUMBER (insval, 0);
3465 if (pos >= beg + len)
3466 pos += XINT (insval) - len;
3467 else if (pos > beg)
3468 pos = beg;
3469 TEMP_SET_PT (pos);
3470 len = XINT (insval);
3471 }
3472
3473 return make_number (len);
3474 }
3475
3476 Lisp_Object
3477 code_convert_string (str, coding, encodep, nocopy)
3478 Lisp_Object str, nocopy;
3479 struct coding_system *coding;
3480 int encodep;
3481 {
3482 int len, consumed, produced;
3483 char *buf;
3484 unsigned char *begp, *endp;
3485 int head_skip, tail_skip;
3486 struct gcpro gcpro1;
3487
3488 if (encodep && !NILP (coding->pre_write_conversion)
3489 || !encodep && !NILP (coding->post_read_conversion))
3490 {
3491 /* Since we have to call Lisp functions which assume target text
3492 is in a buffer, after setting a temporary buffer, call
3493 code_convert_region. */
3494 int count = specpdl_ptr - specpdl;
3495 int len = XSTRING (str)->size;
3496 Lisp_Object result;
3497 struct buffer *old = current_buffer;
3498
3499 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3500 temp_output_buffer_setup (" *code-converting-work*");
3501 set_buffer_internal (XBUFFER (Vstandard_output));
3502 insert_from_string (str, 0, len, 0);
3503 code_convert_region (make_number (BEGV), make_number (ZV),
3504 coding, encodep);
3505 result = make_buffer_string (BEGV, ZV, 0);
3506 set_buffer_internal (old);
3507 return unbind_to (count, result);
3508 }
3509
3510 /* We may be able to shrink the conversion region. */
3511 begp = XSTRING (str)->data;
3512 endp = begp + XSTRING (str)->size;
3513 shrink_conversion_area (&begp, &endp, coding, encodep);
3514
3515 if (begp == endp)
3516 /* We need no conversion. */
3517 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3518
3519 head_skip = begp - XSTRING (str)->data;
3520 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3521
3522 GCPRO1 (str);
3523
3524 if (encodep)
3525 len = encoding_buffer_size (coding, endp - begp);
3526 else
3527 len = decoding_buffer_size (coding, endp - begp);
3528 buf = get_conversion_buffer (len + head_skip + tail_skip);
3529
3530 bcopy (XSTRING (str)->data, buf, head_skip);
3531 coding->last_block = 1;
3532 produced = (encodep
3533 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3534 buf + head_skip, endp - begp, len, &consumed)
3535 : decode_coding (coding, XSTRING (str)->data + head_skip,
3536 buf + head_skip, endp - begp, len, &consumed));
3537 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3538 buf + head_skip + produced,
3539 tail_skip);
3540
3541 UNGCPRO;
3542
3543 return make_string (buf, head_skip + produced + tail_skip);
3544 }
3545
3546 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3547 3, 3, "r\nzCoding system: ",
3548 "Decode current region by specified coding system.\n\
3549 When called from a program, takes three arguments:\n\
3550 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3551 Return length of decoded text.")
3552 (b, e, coding_system)
3553 Lisp_Object b, e, coding_system;
3554 {
3555 struct coding_system coding;
3556
3557 CHECK_NUMBER_COERCE_MARKER (b, 0);
3558 CHECK_NUMBER_COERCE_MARKER (e, 1);
3559 CHECK_SYMBOL (coding_system, 2);
3560
3561 if (NILP (coding_system))
3562 return make_number (XFASTINT (e) - XFASTINT (b));
3563 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3564 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3565
3566 return code_convert_region (b, e, &coding, 0);
3567 }
3568
3569 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3570 3, 3, "r\nzCoding system: ",
3571 "Encode current region by specified coding system.\n\
3572 When called from a program, takes three arguments:\n\
3573 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3574 Return length of encoded text.")
3575 (b, e, coding_system)
3576 Lisp_Object b, e, coding_system;
3577 {
3578 struct coding_system coding;
3579
3580 CHECK_NUMBER_COERCE_MARKER (b, 0);
3581 CHECK_NUMBER_COERCE_MARKER (e, 1);
3582 CHECK_SYMBOL (coding_system, 2);
3583
3584 if (NILP (coding_system))
3585 return make_number (XFASTINT (e) - XFASTINT (b));
3586 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3587 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3588
3589 return code_convert_region (b, e, &coding, 1);
3590 }
3591
3592 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3593 2, 3, 0,
3594 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3595 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3596 if the decoding operation is trivial.")
3597 (string, coding_system, nocopy)
3598 Lisp_Object string, coding_system, nocopy;
3599 {
3600 struct coding_system coding;
3601
3602 CHECK_STRING (string, 0);
3603 CHECK_SYMBOL (coding_system, 1);
3604
3605 if (NILP (coding_system))
3606 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3607 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3608 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3609
3610 return code_convert_string (string, &coding, 0, nocopy);
3611 }
3612
3613 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3614 2, 3, 0,
3615 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3616 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3617 if the encoding operation is trivial.")
3618 (string, coding_system, nocopy)
3619 Lisp_Object string, coding_system, nocopy;
3620 {
3621 struct coding_system coding;
3622
3623 CHECK_STRING (string, 0);
3624 CHECK_SYMBOL (coding_system, 1);
3625
3626 if (NILP (coding_system))
3627 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3628 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3629 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3630
3631 return code_convert_string (string, &coding, 1, nocopy);
3632 }
3633
3634 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3635 "Decode a JISX0208 character of shift-jis encoding.\n\
3636 CODE is the character code in SJIS.\n\
3637 Return the corresponding character.")
3638 (code)
3639 Lisp_Object code;
3640 {
3641 unsigned char c1, c2, s1, s2;
3642 Lisp_Object val;
3643
3644 CHECK_NUMBER (code, 0);
3645 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3646 DECODE_SJIS (s1, s2, c1, c2);
3647 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3648 return val;
3649 }
3650
3651 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3652 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3653 Return the corresponding character code in SJIS.")
3654 (ch)
3655 Lisp_Object ch;
3656 {
3657 int charset, c1, c2, s1, s2;
3658 Lisp_Object val;
3659
3660 CHECK_NUMBER (ch, 0);
3661 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3662 if (charset == charset_jisx0208)
3663 {
3664 ENCODE_SJIS (c1, c2, s1, s2);
3665 XSETFASTINT (val, (s1 << 8) | s2);
3666 }
3667 else
3668 XSETFASTINT (val, 0);
3669 return val;
3670 }
3671
3672 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3673 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3674 CODE is the character code in BIG5.\n\
3675 Return the corresponding character.")
3676 (code)
3677 Lisp_Object code;
3678 {
3679 int charset;
3680 unsigned char b1, b2, c1, c2;
3681 Lisp_Object val;
3682
3683 CHECK_NUMBER (code, 0);
3684 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3685 DECODE_BIG5 (b1, b2, charset, c1, c2);
3686 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3687 return val;
3688 }
3689
3690 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3691 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3692 Return the corresponding character code in Big5.")
3693 (ch)
3694 Lisp_Object ch;
3695 {
3696 int charset, c1, c2, b1, b2;
3697 Lisp_Object val;
3698
3699 CHECK_NUMBER (ch, 0);
3700 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3701 if (charset == charset_big5_1 || charset == charset_big5_2)
3702 {
3703 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3704 XSETFASTINT (val, (b1 << 8) | b2);
3705 }
3706 else
3707 XSETFASTINT (val, 0);
3708 return val;
3709 }
3710
3711 DEFUN ("set-terminal-coding-system-internal",
3712 Fset_terminal_coding_system_internal,
3713 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3714 (coding_system)
3715 Lisp_Object coding_system;
3716 {
3717 CHECK_SYMBOL (coding_system, 0);
3718 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3719 /* We had better not send unsafe characters to terminal. */
3720 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3721
3722 return Qnil;
3723 }
3724
3725 DEFUN ("set-safe-terminal-coding-system-internal",
3726 Fset_safe_terminal_coding_system_internal,
3727 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3728 (coding_system)
3729 Lisp_Object coding_system;
3730 {
3731 CHECK_SYMBOL (coding_system, 0);
3732 setup_coding_system (Fcheck_coding_system (coding_system),
3733 &safe_terminal_coding);
3734 return Qnil;
3735 }
3736
3737 DEFUN ("terminal-coding-system",
3738 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3739 "Return coding-system of your terminal.")
3740 ()
3741 {
3742 return terminal_coding.symbol;
3743 }
3744
3745 DEFUN ("set-keyboard-coding-system-internal",
3746 Fset_keyboard_coding_system_internal,
3747 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3748 (coding_system)
3749 Lisp_Object coding_system;
3750 {
3751 CHECK_SYMBOL (coding_system, 0);
3752 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3753 return Qnil;
3754 }
3755
3756 DEFUN ("keyboard-coding-system",
3757 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3758 "Return coding-system of what is sent from terminal keyboard.")
3759 ()
3760 {
3761 return keyboard_coding.symbol;
3762 }
3763
3764 \f
3765 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3766 Sfind_operation_coding_system, 1, MANY, 0,
3767 "Choose a coding system for an operation based on the target name.\n\
3768 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3769 DECODING-SYSTEM is the coding system to use for decoding\n\
3770 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3771 for encoding (in case OPERATION does encoding).\n\
3772 \n\
3773 The first argument OPERATION specifies an I/O primitive:\n\
3774 For file I/O, `insert-file-contents' or `write-region'.\n\
3775 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3776 For network I/O, `open-network-stream'.\n\
3777 \n\
3778 The remaining arguments should be the same arguments that were passed\n\
3779 to the primitive. Depending on which primitive, one of those arguments\n\
3780 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3781 whichever argument specifies the file name is TARGET.\n\
3782 \n\
3783 TARGET has a meaning which depends on OPERATION:\n\
3784 For file I/O, TARGET is a file name.\n\
3785 For process I/O, TARGET is a process name.\n\
3786 For network I/O, TARGET is a service name or a port number\n\
3787 \n\
3788 This function looks up what specified for TARGET in,\n\
3789 `file-coding-system-alist', `process-coding-system-alist',\n\
3790 or `network-coding-system-alist' depending on OPERATION.\n\
3791 They may specify a coding system, a cons of coding systems,\n\
3792 or a function symbol to call.\n\
3793 In the last case, we call the function with one argument,\n\
3794 which is a list of all the arguments given to this function.")
3795 (nargs, args)
3796 int nargs;
3797 Lisp_Object *args;
3798 {
3799 Lisp_Object operation, target_idx, target, val;
3800 register Lisp_Object chain;
3801
3802 if (nargs < 2)
3803 error ("Too few arguments");
3804 operation = args[0];
3805 if (!SYMBOLP (operation)
3806 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3807 error ("Invalid first arguement");
3808 if (nargs < 1 + XINT (target_idx))
3809 error ("Too few arguments for operation: %s",
3810 XSYMBOL (operation)->name->data);
3811 target = args[XINT (target_idx) + 1];
3812 if (!(STRINGP (target)
3813 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3814 error ("Invalid %dth argument", XINT (target_idx) + 1);
3815
3816 chain = ((EQ (operation, Qinsert_file_contents)
3817 || EQ (operation, Qwrite_region))
3818 ? Vfile_coding_system_alist
3819 : (EQ (operation, Qopen_network_stream)
3820 ? Vnetwork_coding_system_alist
3821 : Vprocess_coding_system_alist));
3822 if (NILP (chain))
3823 return Qnil;
3824
3825 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3826 {
3827 Lisp_Object elt;
3828 elt = XCONS (chain)->car;
3829
3830 if (CONSP (elt)
3831 && ((STRINGP (target)
3832 && STRINGP (XCONS (elt)->car)
3833 && fast_string_match (XCONS (elt)->car, target) >= 0)
3834 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3835 {
3836 val = XCONS (elt)->cdr;
3837 /* Here, if VAL is both a valid coding system and a valid
3838 function symbol, we return VAL as a coding system. */
3839 if (CONSP (val))
3840 return val;
3841 if (! SYMBOLP (val))
3842 return Qnil;
3843 if (! NILP (Fcoding_system_p (val)))
3844 return Fcons (val, val);
3845 if (! NILP (Ffboundp (val)))
3846 {
3847 val = call1 (val, Flist (nargs, args));
3848 if (CONSP (val))
3849 return val;
3850 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
3851 return Fcons (val, val);
3852 }
3853 return Qnil;
3854 }
3855 }
3856 return Qnil;
3857 }
3858
3859 #endif /* emacs */
3860
3861 \f
3862 /*** 8. Post-amble ***/
3863
3864 init_coding_once ()
3865 {
3866 int i;
3867
3868 /* Emacs' internal format specific initialize routine. */
3869 for (i = 0; i <= 0x20; i++)
3870 emacs_code_class[i] = EMACS_control_code;
3871 emacs_code_class[0x0A] = EMACS_linefeed_code;
3872 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3873 for (i = 0x21 ; i < 0x7F; i++)
3874 emacs_code_class[i] = EMACS_ascii_code;
3875 emacs_code_class[0x7F] = EMACS_control_code;
3876 emacs_code_class[0x80] = EMACS_leading_code_composition;
3877 for (i = 0x81; i < 0xFF; i++)
3878 emacs_code_class[i] = EMACS_invalid_code;
3879 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3880 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3881 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3882 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3883
3884 /* ISO2022 specific initialize routine. */
3885 for (i = 0; i < 0x20; i++)
3886 iso_code_class[i] = ISO_control_code;
3887 for (i = 0x21; i < 0x7F; i++)
3888 iso_code_class[i] = ISO_graphic_plane_0;
3889 for (i = 0x80; i < 0xA0; i++)
3890 iso_code_class[i] = ISO_control_code;
3891 for (i = 0xA1; i < 0xFF; i++)
3892 iso_code_class[i] = ISO_graphic_plane_1;
3893 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3894 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3895 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3896 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3897 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3898 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3899 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3900 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3901 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3902 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3903
3904 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3905 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3906
3907 setup_coding_system (Qnil, &keyboard_coding);
3908 setup_coding_system (Qnil, &terminal_coding);
3909 setup_coding_system (Qnil, &safe_terminal_coding);
3910
3911 #if defined (MSDOS) || defined (WINDOWSNT)
3912 system_eol_type = CODING_EOL_CRLF;
3913 #else
3914 system_eol_type = CODING_EOL_LF;
3915 #endif
3916 }
3917
3918 #ifdef emacs
3919
3920 syms_of_coding ()
3921 {
3922 Qtarget_idx = intern ("target-idx");
3923 staticpro (&Qtarget_idx);
3924
3925 Qcoding_system_history = intern ("coding-system-history");
3926 staticpro (&Qcoding_system_history);
3927 Fset (Qcoding_system_history, Qnil);
3928
3929 /* Target FILENAME is the first argument. */
3930 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3931 /* Target FILENAME is the third argument. */
3932 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3933
3934 Qcall_process = intern ("call-process");
3935 staticpro (&Qcall_process);
3936 /* Target PROGRAM is the first argument. */
3937 Fput (Qcall_process, Qtarget_idx, make_number (0));
3938
3939 Qcall_process_region = intern ("call-process-region");
3940 staticpro (&Qcall_process_region);
3941 /* Target PROGRAM is the third argument. */
3942 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3943
3944 Qstart_process = intern ("start-process");
3945 staticpro (&Qstart_process);
3946 /* Target PROGRAM is the third argument. */
3947 Fput (Qstart_process, Qtarget_idx, make_number (2));
3948
3949 Qopen_network_stream = intern ("open-network-stream");
3950 staticpro (&Qopen_network_stream);
3951 /* Target SERVICE is the fourth argument. */
3952 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3953
3954 Qcoding_system = intern ("coding-system");
3955 staticpro (&Qcoding_system);
3956
3957 Qeol_type = intern ("eol-type");
3958 staticpro (&Qeol_type);
3959
3960 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3961 staticpro (&Qbuffer_file_coding_system);
3962
3963 Qpost_read_conversion = intern ("post-read-conversion");
3964 staticpro (&Qpost_read_conversion);
3965
3966 Qpre_write_conversion = intern ("pre-write-conversion");
3967 staticpro (&Qpre_write_conversion);
3968
3969 Qno_conversion = intern ("no-conversion");
3970 staticpro (&Qno_conversion);
3971
3972 Qundecided = intern ("undecided");
3973 staticpro (&Qundecided);
3974
3975 Qcoding_system_p = intern ("coding-system-p");
3976 staticpro (&Qcoding_system_p);
3977
3978 Qcoding_system_error = intern ("coding-system-error");
3979 staticpro (&Qcoding_system_error);
3980
3981 Fput (Qcoding_system_error, Qerror_conditions,
3982 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3983 Fput (Qcoding_system_error, Qerror_message,
3984 build_string ("Invalid coding system"));
3985
3986 Qcoding_category_index = intern ("coding-category-index");
3987 staticpro (&Qcoding_category_index);
3988
3989 {
3990 int i;
3991 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3992 {
3993 coding_category_table[i] = intern (coding_category_name[i]);
3994 staticpro (&coding_category_table[i]);
3995 Fput (coding_category_table[i], Qcoding_category_index,
3996 make_number (i));
3997 }
3998 }
3999
4000 Qcharacter_unification_table = intern ("character-unification-table");
4001 staticpro (&Qcharacter_unification_table);
4002 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
4003 make_number (0));
4004
4005 Qcharacter_unification_table_for_decode
4006 = intern ("character-unification-table-for-decode");
4007 staticpro (&Qcharacter_unification_table_for_decode);
4008
4009 Qcharacter_unification_table_for_encode
4010 = intern ("character-unification-table-for-encode");
4011 staticpro (&Qcharacter_unification_table_for_encode);
4012
4013 Qsafe_charsets = intern ("safe-charsets");
4014 staticpro (&Qsafe_charsets);
4015
4016 Qemacs_mule = intern ("emacs-mule");
4017 staticpro (&Qemacs_mule);
4018
4019 defsubr (&Scoding_system_p);
4020 defsubr (&Sread_coding_system);
4021 defsubr (&Sread_non_nil_coding_system);
4022 defsubr (&Scheck_coding_system);
4023 defsubr (&Sdetect_coding_region);
4024 defsubr (&Sdecode_coding_region);
4025 defsubr (&Sencode_coding_region);
4026 defsubr (&Sdecode_coding_string);
4027 defsubr (&Sencode_coding_string);
4028 defsubr (&Sdecode_sjis_char);
4029 defsubr (&Sencode_sjis_char);
4030 defsubr (&Sdecode_big5_char);
4031 defsubr (&Sencode_big5_char);
4032 defsubr (&Sset_terminal_coding_system_internal);
4033 defsubr (&Sset_safe_terminal_coding_system_internal);
4034 defsubr (&Sterminal_coding_system);
4035 defsubr (&Sset_keyboard_coding_system_internal);
4036 defsubr (&Skeyboard_coding_system);
4037 defsubr (&Sfind_operation_coding_system);
4038
4039 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
4040 "List of coding systems.\n\
4041 \n\
4042 Do not alter the value of this variable manually. This variable should be\n\
4043 updated by the functions `make-coding-system' and\n\
4044 `define-coding-system-alias'.");
4045 Vcoding_system_list = Qnil;
4046
4047 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
4048 "Alist of coding system names.\n\
4049 Each element is one element list of coding system name.\n\
4050 This variable is given to `completing-read' as TABLE argument.\n\
4051 \n\
4052 Do not alter the value of this variable manually. This variable should be\n\
4053 updated by the functions `make-coding-system' and\n\
4054 `define-coding-system-alias'.");
4055 Vcoding_system_alist = Qnil;
4056
4057 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
4058 "List of coding-categories (symbols) ordered by priority.");
4059 {
4060 int i;
4061
4062 Vcoding_category_list = Qnil;
4063 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
4064 Vcoding_category_list
4065 = Fcons (coding_category_table[i], Vcoding_category_list);
4066 }
4067
4068 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
4069 "Specify the coding system for read operations.\n\
4070 It is useful to bind this variable with `let', but do not set it globally.\n\
4071 If the value is a coding system, it is used for decoding on read operation.\n\
4072 If not, an appropriate element is used from one of the coding system alists:\n\
4073 There are three such tables, `file-coding-system-alist',\n\
4074 `process-coding-system-alist', and `network-coding-system-alist'.");
4075 Vcoding_system_for_read = Qnil;
4076
4077 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
4078 "Specify the coding system for write operations.\n\
4079 It is useful to bind this variable with `let', but do not set it globally.\n\
4080 If the value is a coding system, it is used for encoding on write operation.\n\
4081 If not, an appropriate element is used from one of the coding system alists:\n\
4082 There are three such tables, `file-coding-system-alist',\n\
4083 `process-coding-system-alist', and `network-coding-system-alist'.");
4084 Vcoding_system_for_write = Qnil;
4085
4086 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4087 "Coding system used in the latest file or process I/O.");
4088 Vlast_coding_system_used = Qnil;
4089
4090 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4091 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4092 inhibit_eol_conversion = 0;
4093
4094 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4095 "Alist to decide a coding system to use for a file I/O operation.\n\
4096 The format is ((PATTERN . VAL) ...),\n\
4097 where PATTERN is a regular expression matching a file name,\n\
4098 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4099 If VAL is a coding system, it is used for both decoding and encoding\n\
4100 the file contents.\n\
4101 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4102 and the cdr part is used for encoding.\n\
4103 If VAL is a function symbol, the function must return a coding system\n\
4104 or a cons of coding systems which are used as above.\n\
4105 \n\
4106 See also the function `find-operation-coding-system'.");
4107 Vfile_coding_system_alist = Qnil;
4108
4109 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4110 "Alist to decide a coding system to use for a process I/O operation.\n\
4111 The format is ((PATTERN . VAL) ...),\n\
4112 where PATTERN is a regular expression matching a program name,\n\
4113 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4114 If VAL is a coding system, it is used for both decoding what received\n\
4115 from the program and encoding what sent to the program.\n\
4116 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4117 and the cdr part is used for encoding.\n\
4118 If VAL is a function symbol, the function must return a coding system\n\
4119 or a cons of coding systems which are used as above.\n\
4120 \n\
4121 See also the function `find-operation-coding-system'.");
4122 Vprocess_coding_system_alist = Qnil;
4123
4124 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4125 "Alist to decide a coding system to use for a network I/O operation.\n\
4126 The format is ((PATTERN . VAL) ...),\n\
4127 where PATTERN is a regular expression matching a network service name\n\
4128 or is a port number to connect to,\n\
4129 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4130 If VAL is a coding system, it is used for both decoding what received\n\
4131 from the network stream and encoding what sent to the network stream.\n\
4132 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4133 and the cdr part is used for encoding.\n\
4134 If VAL is a function symbol, the function must return a coding system\n\
4135 or a cons of coding systems which are used as above.\n\
4136 \n\
4137 See also the function `find-operation-coding-system'.");
4138 Vnetwork_coding_system_alist = Qnil;
4139
4140 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4141 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4142 eol_mnemonic_unix = ':';
4143
4144 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4145 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4146 eol_mnemonic_dos = '\\';
4147
4148 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4149 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4150 eol_mnemonic_mac = '/';
4151
4152 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4153 "Mnemonic character indicating end-of-line format is not yet decided.");
4154 eol_mnemonic_undecided = ':';
4155
4156 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4157 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4158 Venable_character_unification = Qt;
4159
4160 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4161 &Vstandard_character_unification_table_for_decode,
4162 "Table for unifying characters when reading.");
4163 Vstandard_character_unification_table_for_decode = Qnil;
4164
4165 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4166 &Vstandard_character_unification_table_for_encode,
4167 "Table for unifying characters when writing.");
4168 Vstandard_character_unification_table_for_encode = Qnil;
4169
4170 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4171 "Alist of charsets vs revision numbers.\n\
4172 While encoding, if a charset (car part of an element) is found,\n\
4173 designate it with the escape sequence identifing revision (cdr part of the element).");
4174 Vcharset_revision_alist = Qnil;
4175
4176 DEFVAR_LISP ("default-process-coding-system",
4177 &Vdefault_process_coding_system,
4178 "Cons of coding systems used for process I/O by default.\n\
4179 The car part is used for decoding a process output,\n\
4180 the cdr part is used for encoding a text to be sent to a process.");
4181 Vdefault_process_coding_system = Qnil;
4182
4183 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4184 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4185 This is a vector of length 256.\n\
4186 If Nth element is non-nil, the existence of code N in a file\n\
4187 \(or output of subprocess) doesn't prevent it to be detected as\n\
4188 a coding system of ISO 2022 variant which has a flag\n\
4189 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4190 or reading output of a subprocess.\n\
4191 Only 128th through 159th elements has a meaning.");
4192 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4193 }
4194
4195 #endif /* emacs */