]> code.delx.au - gnu-emacs/blob - src/coding.c
Copyright up-date.
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
33
34 */
35
36 /*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
44
45 0. Emacs' internal format (emacs-mule)
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in section 2.
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
61 section 4.
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
70
71 4. Raw text
72
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
75
76 5. Other
77
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
87
88 */
89
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
97
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
100 any format of end-of-line. So, Emacs has information of format of
101 end-of-line in each coding-system. See section 6 for more details.
102
103 */
104
105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX are set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112 #if 0
113 int
114 detect_coding_emacs_mule (src, src_end)
115 unsigned char *src, *src_end;
116 {
117 ...
118 }
119 #endif
120
121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
124 CODING to Emacs' internal format (emacs-mule). The resulting text
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches at the head of not-yet-decoded source text.
136
137 Below is a template of these functions. */
138 #if 0
139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
143 {
144 ...
145 }
146 #endif
147
148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
152 a place pointed to by DESTINATION, the length of which should not
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
165 #if 0
166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
170 {
171 ...
172 }
173 #endif
174
175 /*** COMMONLY USED MACROS ***/
176
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES safely get one, two, and three bytes from the
   source text respectively.  If there are not enough bytes in the
   source, they jump to `label_end_of_loop' without consuming
   anything.  The caller should set variables `src' and `src_end' to
   appropriate areas in advance.

   The number of remaining bytes is computed as `src_end - src'
   instead of advancing `src' first (`src + 2 < src_end'), so that no
   pointer beyond one past the end of the source is ever formed; such
   pointer arithmetic is undefined in C.  */

#define ONE_MORE_BYTE(c1)			\
  do {						\
    if (src_end - src >= 1)			\
      (c1) = *src++;				\
    else					\
      goto label_end_of_loop;			\
  } while (0)

#define TWO_MORE_BYTES(c1, c2)			\
  do {						\
    if (src_end - src >= 2)			\
      (c1) = *src++, (c2) = *src++;		\
    else					\
      goto label_end_of_loop;			\
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)			\
  do {							\
    if (src_end - src >= 3)				\
      (c1) = *src++, (c2) = *src++, (c3) = *src++;	\
    else						\
      goto label_end_of_loop;				\
  } while (0)
206
/* The macros DECODE_CHARACTER_ASCII, DECODE_CHARACTER_DIMENSION1, and
   DECODE_CHARACTER_DIMENSION2 below each store the multi-byte form of
   one character at the place pointed to by `dst', advance `dst' past
   the bytes written, and increment coding->produced_char by one.
   The caller must have the variables `dst' and `coding' in scope.  */

/* Store ASCII character C (only its low 7 bits are kept).  */

#define DECODE_CHARACTER_ASCII(c)		\
  do {						\
    *dst = (c) & 0x7F;				\
    dst++;					\
    coding->produced_char++;			\
  } while (0)

/* Store a character of CHARSET whose single position-code is C: the
   base leading-code, then the extended leading-code if it is nonzero,
   then C with the high bit set.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)				\
  do {									\
    unsigned char lc_base = CHARSET_LEADING_CODE_BASE (charset);	\
    unsigned char lc_ext;						\
									\
    *dst++ = lc_base;							\
    lc_ext = CHARSET_LEADING_CODE_EXT (charset);			\
    if (lc_ext > 0)							\
      *dst++ = lc_ext;							\
    *dst++ = (c) | 0x80;						\
    coding->produced_char++;						\
  } while (0)

/* Store a character of CHARSET whose position-codes are C1 and C2:
   same as the one-dimensional form for C1, followed by C2 with the
   high bit set.  Still counts as a single produced character.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)	\
  do {							\
    DECODE_CHARACTER_DIMENSION1 (charset, c1);		\
    *dst = (c2) | 0x80;					\
    dst++;						\
  } while (0)
244
245 \f
246 /*** 1. Preamble ***/
247
248 #ifdef emacs
249 #include <config.h>
250 #endif
251
252 #include <stdio.h>
253
254 #ifdef emacs
255
256 #include "lisp.h"
257 #include "buffer.h"
258 #include "charset.h"
259 #include "composite.h"
260 #include "ccl.h"
261 #include "coding.h"
262 #include "window.h"
263
264 #else /* not emacs */
265
266 #include "mulelib.h"
267
268 #endif /* not emacs */
269
270 Lisp_Object Qcoding_system, Qeol_type;
271 Lisp_Object Qbuffer_file_coding_system;
272 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
273 Lisp_Object Qno_conversion, Qundecided;
274 Lisp_Object Qcoding_system_history;
275 Lisp_Object Qsafe_charsets;
276 Lisp_Object Qvalid_codes;
277
278 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
279 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
280 Lisp_Object Qstart_process, Qopen_network_stream;
281 Lisp_Object Qtarget_idx;
282
283 Lisp_Object Vselect_safe_coding_system_function;
284
285 /* Mnemonic string for each format of end-of-line. */
286 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
287 /* Mnemonic string to indicate format of end-of-line is not yet
288 decided. */
289 Lisp_Object eol_mnemonic_undecided;
290
291 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
292 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
293 int system_eol_type;
294
295 #ifdef emacs
296
297 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
298
299 Lisp_Object Qcoding_system_p, Qcoding_system_error;
300
301 /* Coding system emacs-mule and raw-text are for converting only
302 end-of-line format. */
303 Lisp_Object Qemacs_mule, Qraw_text;
304
305 /* Coding-systems are handed between Emacs Lisp programs and C internal
306 routines by the following three variables. */
307 /* Coding-system for reading files and receiving data from process. */
308 Lisp_Object Vcoding_system_for_read;
309 /* Coding-system for writing files and sending data to process. */
310 Lisp_Object Vcoding_system_for_write;
311 /* Coding-system actually used in the latest I/O. */
312 Lisp_Object Vlast_coding_system_used;
313
314 /* A vector of length 256 which contains information about special
315 Latin codes (especially for dealing with Microsoft codes). */
316 Lisp_Object Vlatin_extra_code_table;
317
318 /* Flag to inhibit code conversion of end-of-line format. */
319 int inhibit_eol_conversion;
320
321 /* Flag to make buffer-file-coding-system inherit from process-coding. */
322 int inherit_process_coding_system;
323
324 /* Coding system to be used to encode text for terminal display. */
325 struct coding_system terminal_coding;
326
327 /* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329 struct coding_system safe_terminal_coding;
330
331 /* Coding system of what is sent from terminal keyboard. */
332 struct coding_system keyboard_coding;
333
334 /* Default coding system to be used to write a file. */
335 struct coding_system default_buffer_file_coding;
336
337 Lisp_Object Vfile_coding_system_alist;
338 Lisp_Object Vprocess_coding_system_alist;
339 Lisp_Object Vnetwork_coding_system_alist;
340
341 Lisp_Object Vlocale_coding_system;
342
343 #endif /* emacs */
344
345 Lisp_Object Qcoding_category, Qcoding_category_index;
346
347 /* List of symbols `coding-category-xxx' ordered by priority. */
348 Lisp_Object Vcoding_category_list;
349
350 /* Table of coding categories (Lisp symbols). */
351 Lisp_Object Vcoding_category_table;
352
353 /* Table of names of symbol for each coding-category. */
354 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
355 "coding-category-emacs-mule",
356 "coding-category-sjis",
357 "coding-category-iso-7",
358 "coding-category-iso-7-tight",
359 "coding-category-iso-8-1",
360 "coding-category-iso-8-2",
361 "coding-category-iso-7-else",
362 "coding-category-iso-8-else",
363 "coding-category-ccl",
364 "coding-category-big5",
365 "coding-category-raw-text",
366 "coding-category-binary"
367 };
368
369 /* Table of pointers to coding systems corresponding to each coding
370 categories. */
371 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
372
373 /* Table of coding category masks. Nth element is a mask for a coding
374 category of which priority is Nth. */
375 static
376 int coding_priorities[CODING_CATEGORY_IDX_MAX];
377
378 /* Flag to tell if we look up translation table on character code
379 conversion. */
380 Lisp_Object Venable_character_translation;
381 /* Standard translation table to look up on decoding (reading). */
382 Lisp_Object Vstandard_translation_table_for_decode;
383 /* Standard translation table to look up on encoding (writing). */
384 Lisp_Object Vstandard_translation_table_for_encode;
385
386 Lisp_Object Qtranslation_table;
387 Lisp_Object Qtranslation_table_id;
388 Lisp_Object Qtranslation_table_for_decode;
389 Lisp_Object Qtranslation_table_for_encode;
390
391 /* Alist of charsets vs revision number. */
392 Lisp_Object Vcharset_revision_alist;
393
394 /* Default coding systems used for process I/O. */
395 Lisp_Object Vdefault_process_coding_system;
396
397 /* Global flag to tell that we can't call post-read-conversion and
398 pre-write-conversion functions. Usually the value is zero, but it
399 is set to 1 temporarily while such functions are running. This is
400 to avoid infinite recursive call. */
401 static int inhibit_pre_post_conversion;
402
403 \f
404 /*** 2. Emacs internal format (emacs-mule) handlers ***/
405
406 /* Emacs' internal format for encoding multiple character sets is a
407 kind of multi-byte encoding, i.e. characters are encoded by
408 variable-length sequences of one-byte codes. ASCII characters
409 and control characters (e.g. `tab', `newline') are represented by
410 one-byte sequences which are their ASCII codes, in the range 0x00
411 through 0x7F. The other characters are represented by a sequence
412 of `base leading-code', optional `extended leading-code', and one
413 or two `position-code's. The length of the sequence is determined
414 by the base leading-code. Leading-code takes the range 0x80
415 through 0x9F, whereas extended leading-code and position-code take
416 the range 0xA0 through 0xFF. See `charset.h' for more details
417 about leading-code and position-code.
418
419 --- CODE RANGE of Emacs' internal format ---
420 (character set) (range)
421 ASCII 0x00 .. 0x7F
422 ELSE (1st byte) 0x81 .. 0x9F
423 (rest bytes) 0xA0 .. 0xFF
424 ---------------------------------------------
425
426 */
427
428 enum emacs_code_class_type emacs_code_class[256];
429
/* Consume one byte at SRC and fall through to the next statement only
   if that byte is in the range 0xA0..0xFF.  If the byte is less than
   0xA0, make the enclosing function return 0.  If no byte is left
   (SRC has reached SRC_END), jump to `label_end_of_switch' without
   consuming anything.  */
#define CHECK_CODE_RANGE_A0_FF		\
  do {					\
    if (src < src_end)			\
      {					\
	if (*src++ < 0xA0)		\
	  return 0;			\
      }					\
    else				\
      goto label_end_of_switch;		\
  } while (0)
439
440 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
441 Check if a text is encoded in Emacs' internal format. If it is,
442 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
443
444 int
445 detect_coding_emacs_mule (src, src_end)
446 unsigned char *src, *src_end;
447 {
448 unsigned char c;
449 int composing = 0;
450
451 while (src < src_end)
452 {
453 c = *src++;
454
455 if (composing)
456 {
457 if (c < 0xA0)
458 composing = 0;
459 else
460 c -= 0x20;
461 }
462
463 switch (emacs_code_class[c])
464 {
465 case EMACS_ascii_code:
466 case EMACS_linefeed_code:
467 break;
468
469 case EMACS_control_code:
470 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
471 return 0;
472 break;
473
474 case EMACS_invalid_code:
475 return 0;
476
477 case EMACS_leading_code_4:
478 CHECK_CODE_RANGE_A0_FF;
479 /* fall down to check it two more times ... */
480
481 case EMACS_leading_code_3:
482 CHECK_CODE_RANGE_A0_FF;
483 /* fall down to check it one more time ... */
484
485 case EMACS_leading_code_2:
486 CHECK_CODE_RANGE_A0_FF;
487 break;
488
489 case 0x80: /* Old leading code for a composite character. */
490 if (composing)
491 CHECK_CODE_RANGE_A0_FF;
492 else
493 composing = 1;
494 break;
495
496 default:
497 label_end_of_switch:
498 break;
499 }
500 }
501 return CODING_CATEGORY_MASK_EMACS_MULE;
502 }
503
504 \f
505 /*** 3. ISO2022 handlers ***/
506
507 /* The following note describes the coding system ISO2022 briefly.
508 Since the intention of this note is to help understand the
509 functions in this file, some parts are NOT ACCURATE or OVERLY
510 SIMPLIFIED. For thorough understanding, please refer to the
511 original document of ISO2022.
512
513 ISO2022 provides many mechanisms to encode several character sets
514 in 7-bit and 8-bit environments. For 7-bit environments, all text
515 is encoded using bytes less than 128. This may make the encoded
516 text a little bit longer, but the text passes more easily through
517 several gateways, some of which strip off MSB (Most Significant Bit).
518
519 There are two kinds of character sets: control character set and
520 graphic character set. The former contains control characters such
521 as `newline' and `escape' to provide control functions (control
522 functions are also provided by escape sequences). The latter
523 contains graphic characters such as 'A' and '-'. Emacs recognizes
524 two control character sets and many graphic character sets.
525
526 Graphic character sets are classified into one of the following
527 four classes, according to the number of bytes (DIMENSION) and
528 number of characters in one dimension (CHARS) of the set:
529 - DIMENSION1_CHARS94
530 - DIMENSION1_CHARS96
531 - DIMENSION2_CHARS94
532 - DIMENSION2_CHARS96
533
534 In addition, each character set is assigned an identification tag,
535 unique for each set, called "final character" (denoted as <F>
536 hereafter). The <F> of each character set is decided by ECMA(*)
537 when it is registered in ISO. The code range of <F> is 0x30..0x7F
538 (0x30..0x3F are for private use only).
539
540 Note (*): ECMA = European Computer Manufacturers Association
541
542 Here are examples of graphic character set [NAME(<F>)]:
543 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
544 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
545 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
546 o DIMENSION2_CHARS96 -- none for the moment
547
548 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
549 C0 [0x00..0x1F] -- control character plane 0
550 GL [0x20..0x7F] -- graphic character plane 0
551 C1 [0x80..0x9F] -- control character plane 1
552 GR [0xA0..0xFF] -- graphic character plane 1
553
554 A control character set is directly designated and invoked to C0 or
555 C1 by an escape sequence. The most common case is that:
556 - ISO646's control character set is designated/invoked to C0, and
557 - ISO6429's control character set is designated/invoked to C1,
558 and usually these designations/invocations are omitted in encoded
559 text. In a 7-bit environment, only C0 can be used, and a control
560 character for C1 is encoded by an appropriate escape sequence to
561 fit into the environment. All control characters for C1 are
562 defined to have corresponding escape sequences.
563
564 A graphic character set is at first designated to one of four
565 graphic registers (G0 through G3), then these graphic registers are
566 invoked to GL or GR. These designations and invocations can be
567 done independently. The most common case is that G0 is invoked to
568 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
569 these invocations and designations are omitted in encoded text.
570 In a 7-bit environment, only GL can be used.
571
572 When a graphic character set of CHARS94 is invoked to GL, codes
573 0x20 and 0x7F of the GL area work as control characters SPACE and
574 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
575 be used.
576
577 There are two ways of invocation: locking-shift and single-shift.
578 With locking-shift, the invocation lasts until the next different
579 invocation, whereas with single-shift, the invocation affects the
580 following character only and doesn't affect the locking-shift
581 state. Invocations are done by the following control characters or
582 escape sequences:
583
584 ----------------------------------------------------------------------
585 abbrev function cntrl escape seq description
586 ----------------------------------------------------------------------
587 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
588 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
589 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
590 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
591 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
592 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
593 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
594 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
595 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
596 ----------------------------------------------------------------------
597 (*) These are not used by any known coding system.
598
599 Control characters for these functions are defined by macros
600 ISO_CODE_XXX in `coding.h'.
601
602 Designations are done by the following escape sequences:
603 ----------------------------------------------------------------------
604 escape sequence description
605 ----------------------------------------------------------------------
606 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
607 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
608 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
609 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
610 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
611 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
612 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
613 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
614 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
615 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
616 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
617 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
618 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
619 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
620 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
621 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
622 ----------------------------------------------------------------------
623
624 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
625 of dimension 1, chars 94, and final character <F>, etc...
626
627 Note (*): Although these designations are not allowed in ISO2022,
628 Emacs accepts them on decoding, and produces them on encoding
629 CHARS96 character sets in a coding system which is characterized as
630 7-bit environment, non-locking-shift, and non-single-shift.
631
632 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
633 '(' can be omitted. We refer to this as "short-form" hereafter.
634
635 Now you may notice that there are a lot of ways for encoding the
636 same multilingual text in ISO2022. Actually, there exist many
637 coding systems such as Compound Text (used in X11's inter client
638 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
639 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
640 localized platforms), and all of these are variants of ISO2022.
641
642 In addition to the above, Emacs handles two more kinds of escape
643 sequences: ISO6429's direction specification and Emacs' private
644 sequence for specifying character composition.
645
646 ISO6429's direction specification takes the following form:
647 o CSI ']' -- end of the current direction
648 o CSI '0' ']' -- end of the current direction
649 o CSI '1' ']' -- start of left-to-right text
650 o CSI '2' ']' -- start of right-to-left text
651 The control character CSI (0x9B: control sequence introducer) is
652 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
653
654 Character composition specification takes the following form:
655 o ESC '0' -- start relative composition
656 o ESC '1' -- end composition
657 o ESC '2' -- start rule-base composition (*)
658 o ESC '3' -- start relative composition with alternate chars (**)
659 o ESC '4' -- start rule-base composition with alternate chars (**)
660 Since these are not standard escape sequences of any ISO standard,
661 the use of them for these meanings is restricted to Emacs only.
662
663 (*) This form is used only in Emacs 20.5 and the older versions,
664 but the newer versions can safely decode it.
665 (**) This form is used only in Emacs 21.1 and the newer versions,
666 and the older versions can't decode it.
667
668 Here's a list of example usages of these composition escape
669 sequences (categorized by `enum composition_method').
670
671 COMPOSITION_RELATIVE:
672 ESC 0 CHAR [ CHAR ] ESC 1
673 COMPOSITION_WITH_RULE:
674 ESC 2 CHAR [ RULE CHAR ] ESC 1
675 COMPOSITION_WITH_ALTCHARS:
676 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
677 COMPOSITION_WITH_RULE_ALTCHARS:
678 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
679
680 enum iso_code_class_type iso_code_class[256];
681
/* Nonzero if a coding system is registered for category index IDX and
   that coding system either has a nonzero `safe_charsets' entry for
   CHARSET or has a requested designation for CHARSET.  */
#define CHARSET_OK(idx, charset)				\
  (coding_system_table[idx]					\
   && (coding_system_table[idx]->safe_charsets[charset]	\
       || (CODING_SPEC_ISO_REQUESTED_DESIGNATION		\
	   (coding_system_table[idx], charset)			\
	   != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))

/* Nonzero if a coding system is registered for category index IDX and
   its initial designation for register 1 is non-negative.  Like
   CHARSET_OK, this yields zero instead of dereferencing a null table
   entry when no coding system is registered for IDX.  */
#define SHIFT_OUT_OK(idx)						\
  (coding_system_table[idx]						\
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
691
692 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
693 Check if a text is encoded in ISO2022. If it is, returns an
694 integer in which appropriate flag bits any of:
695 CODING_CATEGORY_MASK_ISO_7
696 CODING_CATEGORY_MASK_ISO_7_TIGHT
697 CODING_CATEGORY_MASK_ISO_8_1
698 CODING_CATEGORY_MASK_ISO_8_2
699 CODING_CATEGORY_MASK_ISO_7_ELSE
700 CODING_CATEGORY_MASK_ISO_8_ELSE
701 are set. If a code which should never appear in ISO2022 is found,
702 returns 0. */
703
704 int
705 detect_coding_iso2022 (src, src_end)
706 unsigned char *src, *src_end;
707 {
708 int mask = CODING_CATEGORY_MASK_ISO;
709 int mask_found = 0;
710 int reg[4], shift_out = 0, single_shifting = 0;
711 int c, c1, i, charset;
712
713 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
714 while (mask && src < src_end)
715 {
716 c = *src++;
717 switch (c)
718 {
719 case ISO_CODE_ESC:
720 single_shifting = 0;
721 if (src >= src_end)
722 break;
723 c = *src++;
724 if (c >= '(' && c <= '/')
725 {
726 /* Designation sequence for a charset of dimension 1. */
727 if (src >= src_end)
728 break;
729 c1 = *src++;
730 if (c1 < ' ' || c1 >= 0x80
731 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
732 /* Invalid designation sequence. Just ignore. */
733 break;
734 reg[(c - '(') % 4] = charset;
735 }
736 else if (c == '$')
737 {
738 /* Designation sequence for a charset of dimension 2. */
739 if (src >= src_end)
740 break;
741 c = *src++;
742 if (c >= '@' && c <= 'B')
743 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
744 reg[0] = charset = iso_charset_table[1][0][c];
745 else if (c >= '(' && c <= '/')
746 {
747 if (src >= src_end)
748 break;
749 c1 = *src++;
750 if (c1 < ' ' || c1 >= 0x80
751 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
752 /* Invalid designation sequence. Just ignore. */
753 break;
754 reg[(c - '(') % 4] = charset;
755 }
756 else
757 /* Invalid designation sequence. Just ignore. */
758 break;
759 }
760 else if (c == 'N' || c == 'O')
761 {
762 /* ESC <Fe> for SS2 or SS3. */
763 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
764 break;
765 }
766 else if (c >= '0' && c <= '4')
767 {
768 /* ESC <Fp> for start/end composition. */
769 mask_found |= CODING_CATEGORY_MASK_ISO;
770 break;
771 }
772 else
773 /* Invalid escape sequence. Just ignore. */
774 break;
775
776 /* We found a valid designation sequence for CHARSET. */
777 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
778 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
779 mask_found |= CODING_CATEGORY_MASK_ISO_7;
780 else
781 mask &= ~CODING_CATEGORY_MASK_ISO_7;
782 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
783 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
784 else
785 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
786 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
787 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
788 else
789 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
790 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
791 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
792 else
793 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
794 break;
795
796 case ISO_CODE_SO:
797 single_shifting = 0;
798 if (shift_out == 0
799 && (reg[1] >= 0
800 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
801 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
802 {
803 /* Locking shift out. */
804 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
805 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
806 }
807 break;
808
809 case ISO_CODE_SI:
810 single_shifting = 0;
811 if (shift_out == 1)
812 {
813 /* Locking shift in. */
814 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
815 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
816 }
817 break;
818
819 case ISO_CODE_CSI:
820 single_shifting = 0;
821 case ISO_CODE_SS2:
822 case ISO_CODE_SS3:
823 {
824 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
825
826 if (c != ISO_CODE_CSI)
827 {
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
829 & CODING_FLAG_ISO_SINGLE_SHIFT)
830 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
831 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
832 & CODING_FLAG_ISO_SINGLE_SHIFT)
833 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
834 single_shifting = 1;
835 }
836 if (VECTORP (Vlatin_extra_code_table)
837 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
838 {
839 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
840 & CODING_FLAG_ISO_LATIN_EXTRA)
841 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
842 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
843 & CODING_FLAG_ISO_LATIN_EXTRA)
844 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
845 }
846 mask &= newmask;
847 mask_found |= newmask;
848 }
849 break;
850
851 default:
852 if (c < 0x80)
853 {
854 single_shifting = 0;
855 break;
856 }
857 else if (c < 0xA0)
858 {
859 single_shifting = 0;
860 if (VECTORP (Vlatin_extra_code_table)
861 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
862 {
863 int newmask = 0;
864
865 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
866 & CODING_FLAG_ISO_LATIN_EXTRA)
867 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
868 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
869 & CODING_FLAG_ISO_LATIN_EXTRA)
870 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
871 mask &= newmask;
872 mask_found |= newmask;
873 }
874 else
875 return 0;
876 }
877 else
878 {
879 unsigned char *src_begin = src;
880
881 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
882 | CODING_CATEGORY_MASK_ISO_7_ELSE);
883 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
884 /* Check the length of succeeding codes of the range
885 0xA0..0xFF. If the byte length is odd, we exclude
886 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
887 when we are not single shifting. */
888 if (!single_shifting)
889 {
890 while (src < src_end && *src >= 0xA0)
891 src++;
892 if ((src - src_begin - 1) & 1 && src < src_end)
893 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
894 else
895 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
896 }
897 }
898 break;
899 }
900 }
901
902 return (mask & mask_found);
903 }
904
/* Decode one character of CHARSET whose first position code is C1.
   When CHARSET has dimension 2, the second position code is read from
   SRC into C2; if that byte is not a graphic code it is pushed back
   and C1 is treated as ASCII.  A negative CHARSET means the text is
   ill-formed, and C1 is then produced as is.

   The character is written to DST unless a composition with
   alternate characters is in progress.  While composing with any
   method other than COMPOSITION_RELATIVE, the character is also
   recorded as a component in coding->cmp_data->data, and
   coding->composition_rule_follows is set unless the method is
   COMPOSITION_WITH_ALTCHARS.  */

#define DECODE_ISO_CHARACTER(charset, c1)				\
  do {									\
    int cs_ = (charset);						\
    int translated_ = -1;						\
									\
    if (cs_ >= 0)							\
      {									\
	if (CHARSET_DIMENSION (cs_) == 2)				\
	  {								\
	    int class2_;						\
									\
	    ONE_MORE_BYTE (c2);						\
	    class2_ = iso_code_class[(c2) & 0x7F];			\
	    if (class2_ != ISO_0x20_or_0x7F				\
		&& class2_ != ISO_graphic_plane_0)			\
	      {								\
		/* Not a valid 2nd byte: unread it, fall back to ASCII.  */ \
		charset_fallback_:					\
		src--;							\
		cs_ = CHARSET_ASCII;					\
	      }								\
	  }								\
	if (!NILP (translation_table))					\
	  {								\
	    translated_ = translate_char (translation_table,		\
					  -1, cs_, c1, c2);		\
	    if (translated_ >= 0)					\
	      {								\
		SPLIT_CHAR (translated_, cs_, c1, c2);			\
	      }								\
	  }								\
      }									\
    if (! COMPOSING_P (coding)						\
	|| coding->composing == COMPOSITION_RELATIVE			\
	|| coding->composing == COMPOSITION_WITH_RULE)			\
      {									\
	/* The character itself belongs to the decoded text.  */	\
	if (cs_ < 0 || cs_ == CHARSET_ASCII)				\
	  {								\
	    DECODE_CHARACTER_ASCII (c1);				\
	  }								\
	else if (CHARSET_DIMENSION (cs_) == 1)				\
	  {								\
	    DECODE_CHARACTER_DIMENSION1 (cs_, c1);			\
	  }								\
	else								\
	  {								\
	    DECODE_CHARACTER_DIMENSION2 (cs_, c1, c2);			\
	  }								\
      }									\
    if (COMPOSING_P (coding)						\
	&& coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	/* Record the character as a component of the composition.  */	\
	if (translated_ < 0)						\
	  translated_ = MAKE_CHAR (cs_, c1, c2);			\
	CODING_ADD_COMPOSITION_COMPONENT (coding, translated_);		\
	coding->composition_rule_follows				\
	  = (coding->composing != COMPOSITION_WITH_ALTCHARS);		\
      }									\
  } while (0)
956
/* Designate the charset identified by DIMENSION, CHARS and FINAL_CHAR
   to graphic register REG of CODING.  Control jumps to
   label_invalid_code when FINAL_CHAR is out of range, when no charset
   is found for the triple, or when the charset neither requests REG
   nor is one of the safe charsets of CODING.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)		\
  do {									\
    int designated_;							\
									\
    if ((final_char) < '0' || (final_char) >= 128)			\
      goto label_invalid_code;						\
    designated_ = ISO_CHARSET_TABLE (make_number (dimension),		\
				     make_number (chars),		\
				     make_number (final_char));		\
    if (designated_ < 0							\
	|| (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, designated_)	\
	    != (reg)							\
	    && ! coding->safe_charsets[designated_]))			\
      {									\
	/* Rejected: remember which register was aimed at.  */		\
	coding->spec.iso2022.last_invalid_designation_register = (reg);	\
	goto label_invalid_code;					\
      }									\
    if (coding->spec.iso2022.last_invalid_designation_register == 0	\
	&& (reg) == 0							\
	&& designated_ == CHARSET_ASCII)				\
      {									\
	/* The previous designation to G0 was rejected and inserted	\
	   as is, so insert this ASCII designation as is too; that	\
	   way it is surely written back to a file.  */			\
	coding->spec.iso2022.last_invalid_designation_register = -1;	\
	goto label_invalid_code;					\
      }									\
    coding->spec.iso2022.last_invalid_designation_register = -1;	\
    if ((coding->mode & CODING_MODE_DIRECTION)				\
	&& CHARSET_REVERSE_CHARSET (designated_) >= 0)			\
      designated_ = CHARSET_REVERSE_CHARSET (designated_);		\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = designated_;		\
  } while (0)
992
993 /* Allocate a memory block for storing information about compositions.
994 The block is chained to the already allocated blocks. */
995
996 static void
997 coding_allocate_composition_data (coding, char_offset)
998 struct coding_system *coding;
999 int char_offset;
1000 {
1001 struct composition_data *cmp_data
1002 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1003
1004 cmp_data->char_offset = char_offset;
1005 cmp_data->used = 0;
1006 cmp_data->prev = coding->cmp_data;
1007 cmp_data->next = NULL;
1008 if (coding->cmp_data)
1009 coding->cmp_data->next = cmp_data;
1010 coding->cmp_data = cmp_data;
1011 coding->cmp_data_start = 0;
1012 }
1013
/* Open a new composition record in the current block of CODING.
   Four slots are reserved: slot 0 is set to -1 for now, slot 1 gets
   the starting position (START plus the block's char_offset), slot 3
   gets METHOD, and slot 2 is left for the ending position.
   coding->cmp_data_start remembers where the record begins.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)		\
  do {									\
    struct composition_data *blk_ = coding->cmp_data;			\
    int *rec_ = &blk_->data[blk_->used];				\
									\
    coding->cmp_data_start = blk_->used;				\
    rec_[0] = -1;							\
    rec_[1] = blk_->char_offset + start;				\
    rec_[3] = (int) method;						\
    blk_->used += 4;							\
  } while (0)
1026
/* Close the composition record that starts at coding->cmp_data_start:
   slot 0 receives the number of slots used by the record and slot 2
   the ending position (END plus the block's char_offset).  */

#define CODING_ADD_COMPOSITION_END(coding, end)				\
  do {									\
    struct composition_data *blk_ = coding->cmp_data;			\
    int *rec_ = &blk_->data[coding->cmp_data_start];			\
									\
    rec_[0] = blk_->used - coding->cmp_data_start;			\
    rec_[2] = blk_->char_offset + end;					\
  } while (0)
1036
/* Append one COMPONENT (an alternate character or an encoded
   composition rule) to the current block of CODING.  The value of the
   expression is COMPONENT as stored.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)	\
  (coding->cmp_data->data[coding->cmp_data->used++] = component)
1041
/* Handle a composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.  */

#define DECODE_COMPOSITION_START(c1)					\
  do {									\
    if (coding->composing == COMPOSITION_DISABLED)			\
      {									\
	/* Compositions are not handled: pass the sequence through.  */	\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = c1 & 0x7f;						\
	coding->produced_char += 2;					\
      }									\
    else if (COMPOSING_P (coding))					\
      {									\
	/* A composition is already being handled.  For the two		\
	   methods that carry alternate characters, this sequence	\
	   ends the component list: the codes following it are the	\
	   actual characters stored in a buffer.  */			\
	if (coding->composing == COMPOSITION_WITH_ALTCHARS		\
	    || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)	\
	  {								\
	    coding->composing = COMPOSITION_RELATIVE;			\
	    coding->composition_rule_follows = 0;			\
	  }								\
      }									\
    else								\
      {									\
	/* This is surely the start of a composition.  Make sure	\
	   that coding->cmp_data has enough room for the information	\
	   about it.  If not, leave the decoding loop so that the	\
	   caller can allocate one more block for coding->cmp_data	\
	   and restart the loop.  Memory can't be allocated here	\
	   directly because that may relocate buffers/strings.  */	\
	if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH	\
	    >= COMPOSITION_DATA_SIZE)					\
	  {								\
	    result = CODING_FINISH_INSUFFICIENT_CMP;			\
	    goto label_end_of_loop_2;					\
	  }								\
	if (c1 == '0')							\
	  coding->composing = COMPOSITION_RELATIVE;			\
	else if (c1 == '2')						\
	  coding->composing = COMPOSITION_WITH_RULE;			\
	else if (c1 == '3')						\
	  coding->composing = COMPOSITION_WITH_ALTCHARS;		\
	else								\
	  coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;		\
	CODING_ADD_COMPOSITION_START (coding, coding->produced_char,	\
				      coding->composing);		\
	coding->composition_rule_follows = 0;				\
      }									\
  } while (0)
1088
/* Handle the composition end sequence ESC 1.  C1 is the byte that
   followed ESC.

   If composition handling is disabled, or if no composition is
   currently in progress (a stray ESC 1), the sequence is passed
   through to DST as is and counted as two produced characters.  The
   latter check matters: closing a composition that was never opened
   would overwrite the length and end position of the record at
   coding->cmp_data_start, i.e. of the previously completed
   composition.

   Otherwise the ending position of the current composition is
   recorded and coding->composing returns to COMPOSITION_NO.  */

#define DECODE_COMPOSITION_END(c1)					\
  do {									\
    if (coding->composing == COMPOSITION_DISABLED			\
	|| ! COMPOSING_P (coding))					\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = (c1);							\
	coding->produced_char += 2;					\
      }									\
    else								\
      {									\
	CODING_ADD_COMPOSITION_END (coding, coding->produced_char);	\
	coding->composing = COMPOSITION_NO;				\
      }									\
  } while (0)
1105
/* Decode a composition rule from the byte C1 (and, for the new
   format, one more byte read from SRC into C2) and store the encoded
   rule as a component in coding->cmp_data.  C1 is modified: 32 is
   subtracted from it.  A value that matches neither format is stored
   as rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)					\
  do {									\
    int encoded_ = 0;							\
									\
    (c1) -= 32;								\
    if (c1 < 81)							\
      {									\
	/* Old format (before ver.21): one byte holding two		\
	   reference points in base 9; the value 4 maps to 10.  */	\
	int global_ref_ = (c1) / 9;					\
	int new_ref_ = (c1) % 9;					\
									\
	if (global_ref_ == 4)						\
	  global_ref_ = 10;						\
	if (new_ref_ == 4)						\
	  new_ref_ = 10;						\
	encoded_ = COMPOSITION_ENCODE_RULE (global_ref_, new_ref_);	\
      }									\
    else if (c1 < 93)							\
      {									\
	/* New format (after ver.21): two bytes.  */			\
	ONE_MORE_BYTE (c2);						\
	encoded_ = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);		\
      }									\
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded_);		\
    coding->composition_rule_follows = 0;				\
  } while (0)
1130
1131
1132 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1133
1134 int
1135 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1136 struct coding_system *coding;
1137 unsigned char *source, *destination;
1138 int src_bytes, dst_bytes;
1139 {
1140 unsigned char *src = source;
1141 unsigned char *src_end = source + src_bytes;
1142 unsigned char *dst = destination;
1143 unsigned char *dst_end = destination + dst_bytes;
1144 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1145 from DST_END to assure that overflow checking is necessary only
1146 at the head of loop. */
1147 unsigned char *adjusted_dst_end = dst_end - 6;
1148 int charset;
1149 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1150 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1151 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1152 Lisp_Object translation_table
1153 = coding->translation_table_for_decode;
1154 int result = CODING_FINISH_NORMAL;
1155
1156 if (!NILP (Venable_character_translation) && NILP (translation_table))
1157 translation_table = Vstandard_translation_table_for_decode;
1158
1159 coding->produced_char = 0;
1160 coding->fake_multibyte = 0;
1161 while (src < src_end && (dst_bytes
1162 ? (dst < adjusted_dst_end)
1163 : (dst < src - 6)))
1164 {
1165 /* SRC_BASE remembers the start position in source in each loop.
1166 The loop will be exited when there's not enough source text
1167 to analyze long escape sequence or 2-byte code (within macros
1168 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1169 to SRC_BASE before exiting. */
1170 unsigned char *src_base = src;
1171 int c1 = *src++, c2;
1172
1173 /* We produce no character or one character. */
1174 switch (iso_code_class [c1])
1175 {
1176 case ISO_0x20_or_0x7F:
1177 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1178 {
1179 DECODE_COMPOSITION_RULE (c1);
1180 break;
1181 }
1182 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1183 {
1184 /* This is SPACE or DEL. */
1185 *dst++ = c1;
1186 coding->produced_char++;
1187 break;
1188 }
1189 /* This is a graphic character, we fall down ... */
1190
1191 case ISO_graphic_plane_0:
1192 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1193 DECODE_COMPOSITION_RULE (c1);
1194 else
1195 DECODE_ISO_CHARACTER (charset0, c1);
1196 break;
1197
1198 case ISO_0xA0_or_0xFF:
1199 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1200 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1201 goto label_invalid_code;
1202 /* This is a graphic character, we fall down ... */
1203
1204 case ISO_graphic_plane_1:
1205 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1206 goto label_invalid_code;
1207 DECODE_ISO_CHARACTER (charset1, c1);
1208 break;
1209
1210 case ISO_control_code:
1211 if (COMPOSING_P (coding))
1212 DECODE_COMPOSITION_END ('1');
1213
1214 /* All ISO2022 control characters in this class have the
1215 same representation in Emacs internal format. */
1216 if (c1 == '\n'
1217 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1218 && (coding->eol_type == CODING_EOL_CR
1219 || coding->eol_type == CODING_EOL_CRLF))
1220 {
1221 result = CODING_FINISH_INCONSISTENT_EOL;
1222 goto label_end_of_loop_2;
1223 }
1224 *dst++ = c1;
1225 coding->produced_char++;
1226 break;
1227
1228 case ISO_carriage_return:
1229 if (COMPOSING_P (coding))
1230 DECODE_COMPOSITION_END ('1');
1231
1232 if (coding->eol_type == CODING_EOL_CR)
1233 *dst++ = '\n';
1234 else if (coding->eol_type == CODING_EOL_CRLF)
1235 {
1236 ONE_MORE_BYTE (c1);
1237 if (c1 == ISO_CODE_LF)
1238 *dst++ = '\n';
1239 else
1240 {
1241 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1242 {
1243 result = CODING_FINISH_INCONSISTENT_EOL;
1244 goto label_end_of_loop_2;
1245 }
1246 src--;
1247 *dst++ = '\r';
1248 }
1249 }
1250 else
1251 *dst++ = c1;
1252 coding->produced_char++;
1253 break;
1254
1255 case ISO_shift_out:
1256 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1257 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1258 goto label_invalid_code;
1259 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1260 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1261 break;
1262
1263 case ISO_shift_in:
1264 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1265 goto label_invalid_code;
1266 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1267 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1268 break;
1269
1270 case ISO_single_shift_2_7:
1271 case ISO_single_shift_2:
1272 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1273 goto label_invalid_code;
1274 /* SS2 is handled as an escape sequence of ESC 'N' */
1275 c1 = 'N';
1276 goto label_escape_sequence;
1277
1278 case ISO_single_shift_3:
1279 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1280 goto label_invalid_code;
1281 /* SS2 is handled as an escape sequence of ESC 'O' */
1282 c1 = 'O';
1283 goto label_escape_sequence;
1284
1285 case ISO_control_sequence_introducer:
1286 /* CSI is handled as an escape sequence of ESC '[' ... */
1287 c1 = '[';
1288 goto label_escape_sequence;
1289
1290 case ISO_escape:
1291 ONE_MORE_BYTE (c1);
1292 label_escape_sequence:
1293 /* Escape sequences handled by Emacs are invocation,
1294 designation, direction specification, and character
1295 composition specification. */
1296 switch (c1)
1297 {
1298 case '&': /* revision of following character set */
1299 ONE_MORE_BYTE (c1);
1300 if (!(c1 >= '@' && c1 <= '~'))
1301 goto label_invalid_code;
1302 ONE_MORE_BYTE (c1);
1303 if (c1 != ISO_CODE_ESC)
1304 goto label_invalid_code;
1305 ONE_MORE_BYTE (c1);
1306 goto label_escape_sequence;
1307
1308 case '$': /* designation of 2-byte character set */
1309 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1310 goto label_invalid_code;
1311 ONE_MORE_BYTE (c1);
1312 if (c1 >= '@' && c1 <= 'B')
1313 { /* designation of JISX0208.1978, GB2312.1980,
1314 or JISX0208.1980 */
1315 DECODE_DESIGNATION (0, 2, 94, c1);
1316 }
1317 else if (c1 >= 0x28 && c1 <= 0x2B)
1318 { /* designation of DIMENSION2_CHARS94 character set */
1319 ONE_MORE_BYTE (c2);
1320 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1321 }
1322 else if (c1 >= 0x2C && c1 <= 0x2F)
1323 { /* designation of DIMENSION2_CHARS96 character set */
1324 ONE_MORE_BYTE (c2);
1325 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1326 }
1327 else
1328 goto label_invalid_code;
1329 break;
1330
1331 case 'n': /* invocation of locking-shift-2 */
1332 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1333 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1334 goto label_invalid_code;
1335 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1336 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1337 break;
1338
1339 case 'o': /* invocation of locking-shift-3 */
1340 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1341 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1342 goto label_invalid_code;
1343 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1344 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1345 break;
1346
1347 case 'N': /* invocation of single-shift-2 */
1348 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1349 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1350 goto label_invalid_code;
1351 ONE_MORE_BYTE (c1);
1352 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1353 DECODE_ISO_CHARACTER (charset, c1);
1354 break;
1355
1356 case 'O': /* invocation of single-shift-3 */
1357 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1358 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1359 goto label_invalid_code;
1360 ONE_MORE_BYTE (c1);
1361 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1362 DECODE_ISO_CHARACTER (charset, c1);
1363 break;
1364
1365 case '0': case '2': case '3': case '4': /* start composition */
1366 DECODE_COMPOSITION_START (c1);
1367 break;
1368
1369 case '1': /* end composition */
1370 DECODE_COMPOSITION_END (c1);
1371 break;
1372
1373 case '[': /* specification of direction */
1374 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1375 goto label_invalid_code;
1376 /* For the moment, nested direction is not supported.
1377 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1378 left-to-right, and nozero means right-to-left. */
1379 ONE_MORE_BYTE (c1);
1380 switch (c1)
1381 {
1382 case ']': /* end of the current direction */
1383 coding->mode &= ~CODING_MODE_DIRECTION;
1384
1385 case '0': /* end of the current direction */
1386 case '1': /* start of left-to-right direction */
1387 ONE_MORE_BYTE (c1);
1388 if (c1 == ']')
1389 coding->mode &= ~CODING_MODE_DIRECTION;
1390 else
1391 goto label_invalid_code;
1392 break;
1393
1394 case '2': /* start of right-to-left direction */
1395 ONE_MORE_BYTE (c1);
1396 if (c1 == ']')
1397 coding->mode |= CODING_MODE_DIRECTION;
1398 else
1399 goto label_invalid_code;
1400 break;
1401
1402 default:
1403 goto label_invalid_code;
1404 }
1405 break;
1406
1407 default:
1408 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1409 goto label_invalid_code;
1410 if (c1 >= 0x28 && c1 <= 0x2B)
1411 { /* designation of DIMENSION1_CHARS94 character set */
1412 ONE_MORE_BYTE (c2);
1413 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1414 }
1415 else if (c1 >= 0x2C && c1 <= 0x2F)
1416 { /* designation of DIMENSION1_CHARS96 character set */
1417 ONE_MORE_BYTE (c2);
1418 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1419 }
1420 else
1421 {
1422 goto label_invalid_code;
1423 }
1424 }
1425 /* We must update these variables now. */
1426 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1427 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1428 break;
1429
1430 label_invalid_code:
1431 if (COMPOSING_P (coding))
1432 DECODE_COMPOSITION_END ('1');
1433 coding->produced_char += src - src_base;
1434 while (src_base < src)
1435 *dst++ = (*src_base++) & 0x7F;
1436 }
1437 continue;
1438
1439 label_end_of_loop:
1440 result = CODING_FINISH_INSUFFICIENT_SRC;
1441 label_end_of_loop_2:
1442 src = src_base;
1443 break;
1444 }
1445
1446 if (src < src_end)
1447 {
1448 if (result == CODING_FINISH_NORMAL)
1449 result = CODING_FINISH_INSUFFICIENT_DST;
1450 else if (result != CODING_FINISH_INCONSISTENT_EOL
1451 && coding->mode & CODING_MODE_LAST_BLOCK)
1452 {
1453 /* This is the last block of the text to be decoded. We had
1454 better just flush out all remaining codes in the text
1455 although they are not valid characters. */
1456 if (COMPOSING_P (coding))
1457 DECODE_COMPOSITION_END ('1');
1458 src_bytes = src_end - src;
1459 if (dst_bytes && (dst_end - dst < src_end - src))
1460 src_end = src + (dst_end - dst);
1461 coding->produced_char += src_end - src;
1462 while (src < src_end)
1463 *dst++ = (*src++) & 0x7F;
1464 }
1465 }
1466
1467 coding->consumed = coding->consumed_char = src - source;
1468 coding->produced = dst - destination;
1469 return result;
1470 }
1471
1472 /* ISO2022 encoding stuff. */
1473
1474 /*
1475 It is not enough to say just "ISO2022" on encoding, we have to
1476 specify more details. In Emacs, each coding system of ISO2022
1477 variant has the following specifications:
1478 1. Initial designation to G0 thru G3.
1479 2. Allows short-form designation?
1480 3. ASCII should be designated to G0 before control characters?
1481 4. ASCII should be designated to G0 at end of line?
1482 5. 7-bit environment or 8-bit environment?
1483 6. Use locking-shift?
1484 7. Use Single-shift?
1485 And the following two are only for Japanese:
1486 8. Use ASCII in place of JIS0201-1976-Roman?
1487 9. Use JISX0208-1983 in place of JISX0208-1978?
1488 These specifications are encoded in `coding->flags' as flag bits
1489 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1490 details.
1491 */
1492
/* Produce at DST the escape sequence that designates CHARSET to
   graphic register REG, and record the designation in CODING.  When
   the revision number of CHARSET is less than 255, the sequence is
   preceded by ESC & <revision>.  The short form (no intermediate
   character) is used for a 94x94 charset going to register 0 whose
   <final-char> is '@', 'A', or 'B', provided CODING allows it.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_ = CHARSET_ISO_FINAL_CHAR (charset);		\
    int rev_ = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);	\
    /* Intermediate characters for registers G0..G3.  */		\
    char *inter_ = (CHARSET_CHARS (charset) == 94) ? "()*+" : ",-./";	\
									\
    if (rev_ < 255)							\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + rev_;						\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (CHARSET_DIMENSION (charset) != 1)				\
      {									\
	*dst++ = '$';							\
	if (CHARSET_CHARS (charset) != 94				\
	    || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)		\
	    || reg != 0							\
	    || final_ < '@' || final_ > 'B')				\
	  *dst++ = (unsigned char) (inter_[reg]);			\
      }									\
    else								\
      *dst++ = (unsigned char) (inter_[reg]);				\
    *dst++ = final_;							\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
1534
/* The following two macros produce the code for the ISO2022
   single-shift functions (single-shift-2 and single-shift-3): the
   escape sequence ESC N / ESC O in a 7-bit environment, otherwise the
   single control character, in which case coding->fake_multibyte is
   set.  Both mark CODING as being in the middle of a single shift.  */

#define ENCODE_SINGLE_SHIFT_2				\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'N';					\
      }							\
    else						\
      {							\
	*dst++ = ISO_CODE_SS2;				\
	coding->fake_multibyte = 1;			\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3				\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'O';					\
      }							\
    else						\
      {							\
	*dst++ = ISO_CODE_SS3;				\
	coding->fake_multibyte = 1;			\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)
1562
/* The following four macros produce the code (a control character or
   an escape sequence) for the ISO2022 locking-shift functions
   shift-in, shift-out, locking-shift-2 and locking-shift-3, and
   record in CODING that graphic register 0, 1, 2 or 3 respectively
   is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
    *dst++ = ISO_CODE_SI;				\
  } while (0)

#define ENCODE_SHIFT_OUT				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
    *dst++ = ISO_CODE_SO;				\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
  } while (0)
1590
/* Produce at DST the code for a DIMENSION1 character whose character
   set is CHARSET and whose position-code is C1.  If CHARSET is not
   currently invoked to a graphic plane, the necessary designation
   and invocation sequences are produced first and the loop is
   repeated.  With CODING_FLAG_ISO_SAFE, a character of a charset that
   is not safe for CODING is replaced by one or two substitution
   characters.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single shift was just produced; it covers this code.  */	\
	*dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)		\
		  ? (c1) & 0x7F : (c1) | 0x80);				\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	*dst++ = (c1) & 0x7F;						\
	break;								\
      }									\
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	*dst++ = (c1) | 0x80;						\
	break;								\
      }									\
    if ((coding->flags & CODING_FLAG_ISO_SAFE)				\
	&& ! coding->safe_charsets[charset])				\
      {									\
	/* We should not encode this character; produce one or two	\
	   substitution characters instead.  */				\
	*dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
	if (CHARSET_WIDTH (charset) == 2)				\
	  *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	break;								\
      }									\
    /* CHARSET is not yet invoked to any graphic plane: invoke it,	\
       designating it to some graphic register first if needed, then	\
       go around again to actually produce the character.  */		\
    dst = encode_invocation_designation (charset, coding, dst);		\
  } while (1)
1634
/* Produce at DST the codes for a DIMENSION2 character whose character
   set is CHARSET and whose position-codes are C1 and C2.  If CHARSET
   is not currently invoked to a graphic plane, the necessary
   designation and invocation sequences are produced first and the
   loop is repeated.  With CODING_FLAG_ISO_SAFE, a character of a
   charset that is not safe for CODING is replaced by one or two
   substitution characters.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single shift was just produced; it covers these codes.  */	\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  {								\
	    *dst++ = (c1) & 0x7F;					\
	    *dst++ = (c2) & 0x7F;					\
	  }								\
	else								\
	  {								\
	    *dst++ = (c1) | 0x80;					\
	    *dst++ = (c2) | 0x80;					\
	  }								\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	*dst++ = (c1) & 0x7F;						\
	*dst++ = (c2) & 0x7F;						\
	break;								\
      }									\
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	*dst++ = (c1) | 0x80;						\
	*dst++ = (c2) | 0x80;						\
	break;								\
      }									\
    if ((coding->flags & CODING_FLAG_ISO_SAFE)				\
	&& ! coding->safe_charsets[charset])				\
      {									\
	/* We should not encode this character; produce one or two	\
	   substitution characters instead.  */				\
	*dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
	if (CHARSET_WIDTH (charset) == 2)				\
	  *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	break;								\
      }									\
    /* CHARSET is not yet invoked to any graphic plane: invoke it,	\
       designating it to some graphic register first if needed, then	\
       go around again to actually produce the character.  */		\
    dst = encode_invocation_designation (charset, coding, dst);		\
  } while (1)
1677
/* Encode at DST the character of CHARSET with position codes C1 and
   C2, and count it in coding->consumed_char.

   If a translation table is in effect and it maps the character, C1
   and C2 are overwritten with the position codes of the translated
   character and its charset is used from then on.  The substitutions
   requested by CODING_FLAG_ISO_USE_ROMAN (ASCII -> JISX0201 Roman) and
   CODING_FLAG_ISO_USE_OLDJIS (JISX0208 -> JISX0208.1978) are decided
   on the charset actually being encoded, i.e. the translated one;
   deciding on the original CHARSET would pair the translated position
   codes with an unrelated charset.

   A character of an undefined charset is copied as raw bytes (with
   the eighth bit cleared in a 7-bit environment).  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)				\
  do {									\
    int c_alt, charset_alt;						\
									\
    if (!NILP (translation_table)					\
	&& ((c_alt = translate_char (translation_table, -1,		\
				     charset, c1, c2))			\
	    >= 0))							\
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);				\
    else								\
      charset_alt = charset;						\
    if (CHARSET_DEFINED_P (charset_alt))				\
      {									\
	if (CHARSET_DIMENSION (charset_alt) == 1)			\
	  {								\
	    if (charset_alt == CHARSET_ASCII				\
		&& coding->flags & CODING_FLAG_ISO_USE_ROMAN)		\
	      charset_alt = charset_latin_jisx0201;			\
	    ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);		\
	  }								\
	else								\
	  {								\
	    if (charset_alt == charset_jisx0208				\
		&& coding->flags & CODING_FLAG_ISO_USE_OLDJIS)		\
	      charset_alt = charset_jisx0208_1978;			\
	    ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);	\
	  }								\
      }									\
    else								\
      {									\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  {								\
	    *dst++ = charset & 0x7f;					\
	    *dst++ = c1 & 0x7f;						\
	    if (c2)							\
	      *dst++ = c2 & 0x7f;					\
	  }								\
	else								\
	  {								\
	    *dst++ = charset;						\
	    *dst++ = c1;						\
	    if (c2)							\
	      *dst++ = c2;						\
	  }								\
      }									\
    coding->consumed_char++;						\
  } while (0)
1725
1726 /* Produce designation and invocation codes at a place pointed by DST
1727 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1728 Return new DST. */
1729
1730 unsigned char *
1731 encode_invocation_designation (charset, coding, dst)
1732 int charset;
1733 struct coding_system *coding;
1734 unsigned char *dst;
1735 {
1736 int reg; /* graphic register number */
1737
1738 /* At first, check designations. */
1739 for (reg = 0; reg < 4; reg++)
1740 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1741 break;
1742
1743 if (reg >= 4)
1744 {
1745 /* CHARSET is not yet designated to any graphic registers. */
1746 /* At first check the requested designation. */
1747 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1748 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1749 /* Since CHARSET requests no special designation, designate it
1750 to graphic register 0. */
1751 reg = 0;
1752
1753 ENCODE_DESIGNATION (charset, reg, coding);
1754 }
1755
1756 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1757 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1758 {
1759 /* Since the graphic register REG is not invoked to any graphic
1760 planes, invoke it to graphic plane 0. */
1761 switch (reg)
1762 {
1763 case 0: /* graphic register 0 */
1764 ENCODE_SHIFT_IN;
1765 break;
1766
1767 case 1: /* graphic register 1 */
1768 ENCODE_SHIFT_OUT;
1769 break;
1770
1771 case 2: /* graphic register 2 */
1772 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1773 ENCODE_SINGLE_SHIFT_2;
1774 else
1775 ENCODE_LOCKING_SHIFT_2;
1776 break;
1777
1778 case 3: /* graphic register 3 */
1779 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1780 ENCODE_SINGLE_SHIFT_3;
1781 else
1782 ENCODE_LOCKING_SHIFT_3;
1783 break;
1784 }
1785 }
1786 return dst;
1787 }
1788
/* Produce at DST the 2-byte code for the encoded composition rule
   RULE: the first byte is 32 + 81 plus the first reference point, the
   second byte is 32 plus the second reference point.  */

#define ENCODE_COMPOSITION_RULE(rule)				\
  do {								\
    int global_ref_, new_ref_;					\
								\
    COMPOSITION_DECODE_RULE (rule, global_ref_, new_ref_);	\
    *dst++ = 32 + 81 + global_ref_;				\
    *dst++ = 32 + new_ref_;					\
  } while (0)
1798
/* Produce at DST the sequence that starts a composition (ESC 0,
   ESC 3, or ESC 4).  DATA points to the integers describing the
   composition (see the comment in coding.h for the format); DATA[3]
   is the composition method and becomes coding->composing.  For any
   method other than COMPOSITION_RELATIVE, coding->cmp_data_index is
   set just past the 4-slot header of the record so that its
   components can be produced next.  */

#define ENCODE_COMPOSITION_START(coding, data)				\
  do {									\
    coding->composing = data[3];					\
    *dst++ = ISO_CODE_ESC;						\
    if (coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	if (coding->composing == COMPOSITION_WITH_ALTCHARS)		\
	  *dst++ = '3';							\
	else								\
	  *dst++ = '4';							\
	coding->cmp_data_index = coding->cmp_data_start + 4;		\
	coding->composition_rule_follows = 0;				\
      }									\
    else								\
      *dst++ = '0';							\
  } while (0)
1818
/* Produce at DST the sequence ESC 1 that ends the current
   composition, and advance coding->cmp_data_start past its record
   (DATA[0] slots).  When that exhausts the current block and another
   block follows, move on to the start of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)				\
  do {									\
    *dst++ = ISO_CODE_ESC;						\
    *dst++ = '1';							\
    coding->cmp_data_start += data[0];					\
    coding->composing = COMPOSITION_NO;					\
    if (coding->cmp_data->next						\
	&& coding->cmp_data_start == coding->cmp_data->used)		\
      {									\
	coding->cmp_data_start = 0;					\
	coding->cmp_data = coding->cmp_data->next;			\
      }									\
  } while (0)
1834
/* Produce at DST the composition start sequence ESC 0 and switch
   CODING to COMPOSITION_RELATIVE.  Here the sequence does not start a
   new composition; it marks that the components (alternate chars and
   composition rules) of the current one have all been produced and
   that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)		\
  do {							\
    coding->composing = COMPOSITION_RELATIVE;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = '0';					\
  } while (0)
1846
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC [ when the
   CODING_FLAG_ISO_SEVEN_BITS bit is set in coding->flags (the bit is
   tested, not the whole flag word compared, so other flags may be set
   at the same time), and the single byte CSI otherwise.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R produce the
   introducer followed by `2 ]' and `0 ]' respectively.  All three are
   statement macros; use them as `ENCODE_DIRECTION_R2L;'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = '[';					\
      }							\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

#define ENCODE_DIRECTION_R2L				\
  do {							\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;			\
    *dst++ = '2';					\
    *dst++ = ']';					\
  } while (0)

#define ENCODE_DIRECTION_L2R				\
  do {							\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;			\
    *dst++ = '0';					\
    *dst++ = ']';					\
  } while (0)
1862
/* Produce at DST the codes that bring the graphic planes and
   registers back to the initial state of CODING: a shift-in if
   graphic plane 0 does not have register 0 invoked, then a
   designation for every register whose current charset differs from
   its (non-negative) initial designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					\
  do {									\
    int reg;								\
									\
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			\
      {									\
	ENCODE_SHIFT_IN;						\
      }									\
    reg = 0;								\
    while (reg < 4)							\
      {									\
	if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0	\
	    && (CODING_SPEC_ISO_DESIGNATION (coding, reg)		\
		!= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))	\
	  {								\
	    ENCODE_DESIGNATION						\
	      (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),	\
	       reg, coding);						\
	  }								\
	reg++;								\
      }									\
  } while (0)
1877
1878 /* Produce designation sequences of charsets in the line started from
1879 SRC to a place pointed by *DSTP, and update DSTP.
1880
1881 If the current block ends before any end-of-line, we may fail to
1882 find all the necessary designations. */
1883
1884 void
1885 encode_designation_at_bol (coding, table, src, src_end, dstp)
1886 struct coding_system *coding;
1887 Lisp_Object table;
1888 unsigned char *src, *src_end, **dstp;
1889 {
1890 int charset, c, found = 0, reg;
1891 /* Table of charsets to be designated to each graphic register. */
1892 int r[4];
1893 unsigned char *dst = *dstp;
1894
1895 for (reg = 0; reg < 4; reg++)
1896 r[reg] = -1;
1897
1898 while (src < src_end && *src != '\n' && found < 4)
1899 {
1900 int bytes = BYTES_BY_CHAR_HEAD (*src);
1901
1902 if (NILP (table))
1903 charset = CHARSET_AT (src);
1904 else
1905 {
1906 int c_alt;
1907 unsigned char c1, c2;
1908
1909 SPLIT_STRING(src, bytes, charset, c1, c2);
1910 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
1911 charset = CHAR_CHARSET (c_alt);
1912 }
1913
1914 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1915 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1916 {
1917 found++;
1918 r[reg] = charset;
1919 }
1920
1921 src += bytes;
1922 }
1923
1924 if (found)
1925 {
1926 for (reg = 0; reg < 4; reg++)
1927 if (r[reg] >= 0
1928 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1929 ENCODE_DESIGNATION (r[reg], reg, coding);
1930 *dstp = dst;
1931 }
1932 }
1933
1934 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1935
1936 int
1937 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1938 struct coding_system *coding;
1939 unsigned char *source, *destination;
1940 int src_bytes, dst_bytes;
1941 {
1942 unsigned char *src = source;
1943 unsigned char *src_end = source + src_bytes;
1944 unsigned char *dst = destination;
1945 unsigned char *dst_end = destination + dst_bytes;
1946 /* Since the maximum bytes produced by each loop is 14, we subtract 13
1947 from DST_END to assure overflow checking is necessary only at the
1948 head of loop. */
1949 unsigned char *adjusted_dst_end = dst_end - 13;
1950 Lisp_Object translation_table
1951 = coding->translation_table_for_encode;
1952 int result = CODING_FINISH_NORMAL;
1953
1954 if (!NILP (Venable_character_translation) && NILP (translation_table))
1955 translation_table = Vstandard_translation_table_for_encode;
1956
1957 coding->consumed_char = 0;
1958 coding->fake_multibyte = 0;
1959 while (src < src_end && (dst_bytes
1960 ? (dst < adjusted_dst_end)
1961 : (dst < src - 13)))
1962 {
1963 /* SRC_BASE remembers the start position in source in each loop.
1964 The loop will be exited when there's not enough source text
1965 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1966 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1967 reset to SRC_BASE before exiting. */
1968 unsigned char *src_base = src;
1969 int charset, c1, c2, c3, c4;
1970
1971 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1972 && CODING_SPEC_ISO_BOL (coding))
1973 {
1974 /* We have to produce designation sequences if any now. */
1975 encode_designation_at_bol (coding, translation_table,
1976 src, src_end, &dst);
1977 CODING_SPEC_ISO_BOL (coding) = 0;
1978 }
1979
1980 /* Check composition start and end. */
1981 if (coding->composing != COMPOSITION_DISABLED
1982 && coding->cmp_data_start < coding->cmp_data->used)
1983 {
1984 struct composition_data *cmp_data = coding->cmp_data;
1985 int *data = cmp_data->data + coding->cmp_data_start;
1986 int this_pos = cmp_data->char_offset + coding->consumed_char;
1987
1988 if (coding->composing == COMPOSITION_RELATIVE)
1989 {
1990 if (this_pos == data[2])
1991 {
1992 ENCODE_COMPOSITION_END (coding, data);
1993 cmp_data = coding->cmp_data;
1994 data = cmp_data->data + coding->cmp_data_start;
1995 }
1996 }
1997 else if (COMPOSING_P (coding))
1998 {
1999 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2000 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2001 /* We have consumed components of the composition.
2002 What follows in SRC is the compositions's base
2003 text. */
2004 ENCODE_COMPOSITION_FAKE_START (coding);
2005 else
2006 {
2007 int c = cmp_data->data[coding->cmp_data_index++];
2008 if (coding->composition_rule_follows)
2009 {
2010 ENCODE_COMPOSITION_RULE (c);
2011 coding->composition_rule_follows = 0;
2012 }
2013 else
2014 {
2015 SPLIT_CHAR (c, charset, c1, c2);
2016 ENCODE_ISO_CHARACTER (charset, c1, c2);
2017 /* But, we didn't consume a character in SRC. */
2018 coding->consumed_char--;
2019 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2020 coding->composition_rule_follows = 1;
2021 }
2022 continue;
2023 }
2024 }
2025 if (!COMPOSING_P (coding))
2026 {
2027 if (this_pos == data[1])
2028 {
2029 ENCODE_COMPOSITION_START (coding, data);
2030 continue;
2031 }
2032 }
2033 }
2034
2035 c1 = *src++;
2036 /* Now encode one character. C1 is a control character, an
2037 ASCII character, or a leading-code of multi-byte character. */
2038 switch (emacs_code_class[c1])
2039 {
2040 case EMACS_ascii_code:
2041 c2 = 0;
2042 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
2043 break;
2044
2045 case EMACS_control_code:
2046 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2047 ENCODE_RESET_PLANE_AND_REGISTER;
2048 *dst++ = c1;
2049 coding->consumed_char++;
2050 break;
2051
2052 case EMACS_carriage_return_code:
2053 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2054 {
2055 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2056 ENCODE_RESET_PLANE_AND_REGISTER;
2057 *dst++ = c1;
2058 coding->consumed_char++;
2059 break;
2060 }
2061 /* fall down to treat '\r' as '\n' ... */
2062
2063 case EMACS_linefeed_code:
2064 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2065 ENCODE_RESET_PLANE_AND_REGISTER;
2066 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2067 bcopy (coding->spec.iso2022.initial_designation,
2068 coding->spec.iso2022.current_designation,
2069 sizeof coding->spec.iso2022.initial_designation);
2070 if (coding->eol_type == CODING_EOL_LF
2071 || coding->eol_type == CODING_EOL_UNDECIDED)
2072 *dst++ = ISO_CODE_LF;
2073 else if (coding->eol_type == CODING_EOL_CRLF)
2074 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2075 else
2076 *dst++ = ISO_CODE_CR;
2077 CODING_SPEC_ISO_BOL (coding) = 1;
2078 coding->consumed_char++;
2079 break;
2080
2081 case EMACS_leading_code_2:
2082 ONE_MORE_BYTE (c2);
2083 c3 = 0;
2084 if (c2 < 0xA0)
2085 {
2086 /* invalid sequence */
2087 *dst++ = c1;
2088 src--;
2089 coding->consumed_char++;
2090 }
2091 else
2092 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
2093 break;
2094
2095 case EMACS_leading_code_3:
2096 TWO_MORE_BYTES (c2, c3);
2097 c4 = 0;
2098 if (c2 < 0xA0 || c3 < 0xA0)
2099 {
2100 /* invalid sequence */
2101 *dst++ = c1;
2102 src -= 2;
2103 coding->consumed_char++;
2104 }
2105 else if (c1 < LEADING_CODE_PRIVATE_11)
2106 ENCODE_ISO_CHARACTER (c1, c2, c3);
2107 else
2108 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
2109 break;
2110
2111 case EMACS_leading_code_4:
2112 THREE_MORE_BYTES (c2, c3, c4);
2113 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
2114 {
2115 /* invalid sequence */
2116 *dst++ = c1;
2117 src -= 3;
2118 coding->consumed_char++;
2119 }
2120 else
2121 ENCODE_ISO_CHARACTER (c2, c3, c4);
2122 break;
2123
2124 case EMACS_invalid_code:
2125 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2126 ENCODE_RESET_PLANE_AND_REGISTER;
2127 *dst++ = c1;
2128 coding->consumed_char++;
2129 break;
2130 }
2131 continue;
2132 label_end_of_loop:
2133 result = CODING_FINISH_INSUFFICIENT_SRC;
2134 src = src_base;
2135 break;
2136 }
2137
2138 if (src < src_end && result == CODING_FINISH_NORMAL)
2139 result = CODING_FINISH_INSUFFICIENT_DST;
2140
2141 /* If this is the last block of the text to be encoded, we must
2142 reset graphic planes and registers to the initial state, and
2143 flush out the carryover if any. */
2144 if (coding->mode & CODING_MODE_LAST_BLOCK)
2145 {
2146 ENCODE_RESET_PLANE_AND_REGISTER;
2147 if (COMPOSING_P (coding))
2148 *dst++ = ISO_CODE_ESC, *dst++ = '1';
2149 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2150 {
2151 while (src < src_end && dst < dst_end)
2152 *dst++ = *src++;
2153 }
2154 }
2155 coding->consumed = src - source;
2156 coding->produced = coding->produced_char = dst - destination;
2157 return result;
2158 }
2159
2160 \f
2161 /*** 4. SJIS and BIG5 handlers ***/
2162
2163 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2164 quite widely. So, for the moment, Emacs supports them in the bare
2165 C code. But, in the future, they may be supported only by CCL. */
2166
2167 /* SJIS is a coding system encoding three character sets: ASCII, right
2168 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2169 as is. A character of charset katakana-jisx0201 is encoded by
2170 "position-code + 0x80". A character of charset japanese-jisx0208
   is encoded in 2 bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.
2173
2174 --- CODE RANGE of SJIS ---
2175 (character set) (range)
2176 ASCII 0x00 .. 0x7F
2177 KATAKANA-JISX0201 0xA0 .. 0xDF
2178 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2179 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2180 -------------------------------
2181
2182 */
2183
2184 /* BIG5 is a coding system encoding two character sets: ASCII and
2185 Big5. An ASCII character is encoded as is. Big5 is a two-byte
   character set and is encoded in two bytes.
2187
2188 --- CODE RANGE of BIG5 ---
2189 (character set) (range)
2190 ASCII 0x00 .. 0x7F
2191 Big5 (1st byte) 0xA1 .. 0xFE
2192 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2193 --------------------------
2194
2195 Since the number of characters in Big5 is larger than maximum
2196 characters in Emacs' charset (96x96), it can't be handled as one
2197 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2198 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2199 contains frequently used characters and the latter contains less
2200 frequently used characters. */
2201
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansions, so expressions
   such as `x & 0x7F' or `a ? b : c' may be passed safely.  The input
   arguments are evaluated more than once and therefore must not have
   side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 byte pair B1, B2.  The pair is
   first turned into a linear index (second bytes 0x40..0x7E and
   0xA1..0xFE are packed into one contiguous row of BIG5_SAME_ROW
   entries); first bytes below 0xC9 select `charset_big5_1', the
   others `charset_big5_2' with the index rebased to zero.  The index
   is then split into two position-codes of 0xFF - 0xA1 values each,
   starting at 0x21.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_temp                                              \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                     \
      }                                                                 \
    (c1) = big5_temp / (0xFF - 0xA1) + 0x21;                            \
    (c2) = big5_temp % (0xFF - 0xA1) + 0x21;                            \
  } while (0)

/* The inverse of DECODE_BIG5: set the BIG5 byte pair B1, B2 from
   CHARSET and the position-codes C1, C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_temp                                              \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                       \
    (b1) = big5_temp / BIG5_SAME_ROW + 0xA1;                            \
    (b2) = big5_temp % BIG5_SAME_ROW;                                   \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2234
/* Produce the internal representation of the character of CHARSET
   whose position-codes are C1 and C2.  When the caller's
   TRANSLATION_TABLE is non-nil and has a translation for the
   character, the translated character is produced instead (C1 and C2
   are overwritten with its codes).  The result is produced by
   DECODE_CHARACTER_ASCII for ASCII or a negative charset, otherwise
   by DECODE_CHARACTER_DIMENSION1 or DECODE_CHARACTER_DIMENSION2
   according to the dimension of the charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
    if (!NILP (translation_table))                                      \
      {                                                                 \
        c_alt = translate_char (translation_table,                      \
                                -1, (charset), c1, c2);                 \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
  } while (0)
2249
/* Encode the character of CHARSET whose position-codes are C1 and C2
   and write the result through the caller's DST pointer, then count
   one consumed character in the caller's CODING.  When the caller's
   TRANSLATION_TABLE is non-nil and has a translation for the
   character, the translated character is encoded instead (C1 and C2
   are overwritten with its codes).

   The caller's SJIS_P selects the target: with SJIS_P nonzero,
   katakana-jisx0201, latin-jisx0201, jisx0208 and jisx0208-1978 are
   encoded; with SJIS_P zero, big5-1 and big5-2 are.  ASCII is written
   as is.  A character of any other charset is written as the charset
   followed by its position-code(s), which takes up to three bytes,
   and CODING->fake_multibyte is set.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt = charset;                                   \
    if (!NILP (translation_table))                                      \
      {                                                                 \
        c_alt = translate_char (translation_table, -1,                  \
                                charset, c1, c2);                       \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else if (sjis_p && charset_alt == charset_latin_jisx0201)       \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          {                                                             \
            dst[0] = charset_alt;                                       \
            dst[1] = c1;                                                \
            dst += 2;                                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F;                                                     \
        c2 &= 0x7F;                                                     \
        if (sjis_p && (charset_alt == charset_jisx0208                  \
                       || charset_alt == charset_jisx0208_1978))        \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            dst[0] = s1;                                                \
            dst[1] = s2;                                                \
            dst += 2;                                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            dst[0] = b1;                                                \
            dst[1] = b2;                                                \
            dst += 2;                                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            dst[0] = charset_alt;                                       \
            dst[1] = c1;                                                \
            dst[2] = c2;                                                \
            dst += 3;                                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
2303
2304 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2305 Check if a text is encoded in SJIS. If it is, return
2306 CODING_CATEGORY_MASK_SJIS, else return 0. */
2307
2308 int
2309 detect_coding_sjis (src, src_end)
2310 unsigned char *src, *src_end;
2311 {
2312 unsigned char c;
2313
2314 while (src < src_end)
2315 {
2316 c = *src++;
2317 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2318 {
2319 if (src < src_end && *src++ < 0x40)
2320 return 0;
2321 }
2322 }
2323 return CODING_CATEGORY_MASK_SJIS;
2324 }
2325
2326 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2327 Check if a text is encoded in BIG5. If it is, return
2328 CODING_CATEGORY_MASK_BIG5, else return 0. */
2329
2330 int
2331 detect_coding_big5 (src, src_end)
2332 unsigned char *src, *src_end;
2333 {
2334 unsigned char c;
2335
2336 while (src < src_end)
2337 {
2338 c = *src++;
2339 if (c >= 0xA1)
2340 {
2341 if (src >= src_end)
2342 break;
2343 c = *src++;
2344 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2345 return 0;
2346 }
2347 }
2348 return CODING_CATEGORY_MASK_BIG5;
2349 }
2350
2351 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2352 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2353
2354 int
2355 decode_coding_sjis_big5 (coding, source, destination,
2356 src_bytes, dst_bytes, sjis_p)
2357 struct coding_system *coding;
2358 unsigned char *source, *destination;
2359 int src_bytes, dst_bytes;
2360 int sjis_p;
2361 {
2362 unsigned char *src = source;
2363 unsigned char *src_end = source + src_bytes;
2364 unsigned char *dst = destination;
2365 unsigned char *dst_end = destination + dst_bytes;
2366 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2367 from DST_END to assure overflow checking is necessary only at the
2368 head of loop. */
2369 unsigned char *adjusted_dst_end = dst_end - 3;
2370 Lisp_Object translation_table
2371 = coding->translation_table_for_decode;
2372 int result = CODING_FINISH_NORMAL;
2373
2374 if (!NILP (Venable_character_translation) && NILP (translation_table))
2375 translation_table = Vstandard_translation_table_for_decode;
2376
2377 coding->produced_char = 0;
2378 coding->fake_multibyte = 0;
2379 while (src < src_end && (dst_bytes
2380 ? (dst < adjusted_dst_end)
2381 : (dst < src - 3)))
2382 {
2383 /* SRC_BASE remembers the start position in source in each loop.
2384 The loop will be exited when there's not enough source text
2385 to analyze two-byte character (within macro ONE_MORE_BYTE).
2386 In that case, SRC is reset to SRC_BASE before exiting. */
2387 unsigned char *src_base = src;
2388 unsigned char c1 = *src++, c2, c3, c4;
2389
2390 if (c1 < 0x20)
2391 {
2392 if (c1 == '\r')
2393 {
2394 if (coding->eol_type == CODING_EOL_CRLF)
2395 {
2396 ONE_MORE_BYTE (c2);
2397 if (c2 == '\n')
2398 *dst++ = c2;
2399 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2400 {
2401 result = CODING_FINISH_INCONSISTENT_EOL;
2402 goto label_end_of_loop_2;
2403 }
2404 else
2405 /* To process C2 again, SRC is subtracted by 1. */
2406 *dst++ = c1, src--;
2407 }
2408 else if (coding->eol_type == CODING_EOL_CR)
2409 *dst++ = '\n';
2410 else
2411 *dst++ = c1;
2412 }
2413 else if (c1 == '\n'
2414 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2415 && (coding->eol_type == CODING_EOL_CR
2416 || coding->eol_type == CODING_EOL_CRLF))
2417 {
2418 result = CODING_FINISH_INCONSISTENT_EOL;
2419 goto label_end_of_loop_2;
2420 }
2421 else
2422 *dst++ = c1;
2423 coding->produced_char++;
2424 }
2425 else if (c1 < 0x80)
2426 {
2427 c2 = 0; /* avoid warning */
2428 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2429 }
2430 else
2431 {
2432 if (sjis_p)
2433 {
2434 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2435 {
2436 /* SJIS -> JISX0208 */
2437 ONE_MORE_BYTE (c2);
2438 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2439 {
2440 DECODE_SJIS (c1, c2, c3, c4);
2441 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2442 }
2443 else
2444 goto label_invalid_code_2;
2445 }
2446 else if (c1 < 0xE0)
2447 /* SJIS -> JISX0201-Kana */
2448 {
2449 c2 = 0; /* avoid warning */
2450 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2451 /* dummy */ c2);
2452 }
2453 else
2454 goto label_invalid_code_1;
2455 }
2456 else
2457 {
2458 /* BIG5 -> Big5 */
2459 if (c1 >= 0xA1 && c1 <= 0xFE)
2460 {
2461 ONE_MORE_BYTE (c2);
2462 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2463 {
2464 int charset;
2465
2466 DECODE_BIG5 (c1, c2, charset, c3, c4);
2467 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2468 }
2469 else
2470 goto label_invalid_code_2;
2471 }
2472 else
2473 goto label_invalid_code_1;
2474 }
2475 }
2476 continue;
2477
2478 label_invalid_code_1:
2479 *dst++ = c1;
2480 coding->produced_char++;
2481 coding->fake_multibyte = 1;
2482 continue;
2483
2484 label_invalid_code_2:
2485 *dst++ = c1; *dst++= c2;
2486 coding->produced_char += 2;
2487 coding->fake_multibyte = 1;
2488 continue;
2489
2490 label_end_of_loop:
2491 result = CODING_FINISH_INSUFFICIENT_SRC;
2492 label_end_of_loop_2:
2493 src = src_base;
2494 break;
2495 }
2496
2497 if (src < src_end)
2498 {
2499 if (result == CODING_FINISH_NORMAL)
2500 result = CODING_FINISH_INSUFFICIENT_DST;
2501 else if (result != CODING_FINISH_INCONSISTENT_EOL
2502 && coding->mode & CODING_MODE_LAST_BLOCK)
2503 {
2504 src_bytes = src_end - src;
2505 if (dst_bytes && (dst_end - dst < src_bytes))
2506 src_bytes = dst_end - dst;
2507 bcopy (dst, src, src_bytes);
2508 src += src_bytes;
2509 dst += src_bytes;
2510 coding->fake_multibyte = 1;
2511 }
2512 }
2513
2514 coding->consumed = coding->consumed_char = src - source;
2515 coding->produced = dst - destination;
2516 return result;
2517 }
2518
2519 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2520 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2521 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2522 sure that all these charsets are registered as official charset
2523 (i.e. do not have extended leading-codes). Characters of other
2524 charsets are produced without any encoding. If SJIS_P is 1, encode
2525 SJIS text, else encode BIG5 text. */
2526
2527 int
2528 encode_coding_sjis_big5 (coding, source, destination,
2529 src_bytes, dst_bytes, sjis_p)
2530 struct coding_system *coding;
2531 unsigned char *source, *destination;
2532 int src_bytes, dst_bytes;
2533 int sjis_p;
2534 {
2535 unsigned char *src = source;
2536 unsigned char *src_end = source + src_bytes;
2537 unsigned char *dst = destination;
2538 unsigned char *dst_end = destination + dst_bytes;
2539 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2540 from DST_END to assure overflow checking is necessary only at the
2541 head of loop. */
2542 unsigned char *adjusted_dst_end = dst_end - 1;
2543 Lisp_Object translation_table
2544 = coding->translation_table_for_encode;
2545 int result = CODING_FINISH_NORMAL;
2546
2547 if (!NILP (Venable_character_translation) && NILP (translation_table))
2548 translation_table = Vstandard_translation_table_for_encode;
2549
2550 coding->consumed_char = 0;
2551 coding->fake_multibyte = 0;
2552 while (src < src_end && (dst_bytes
2553 ? (dst < adjusted_dst_end)
2554 : (dst < src - 1)))
2555 {
2556 /* SRC_BASE remembers the start position in source in each loop.
2557 The loop will be exited when there's not enough source text
2558 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2559 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2560 before exiting. */
2561 unsigned char *src_base = src;
2562 unsigned char c1 = *src++, c2, c3, c4;
2563
2564 switch (emacs_code_class[c1])
2565 {
2566 case EMACS_ascii_code:
2567 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2568 break;
2569
2570 case EMACS_control_code:
2571 *dst++ = c1;
2572 coding->consumed_char++;
2573 break;
2574
2575 case EMACS_carriage_return_code:
2576 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2577 {
2578 *dst++ = c1;
2579 coding->consumed_char++;
2580 break;
2581 }
2582 /* fall down to treat '\r' as '\n' ... */
2583
2584 case EMACS_linefeed_code:
2585 if (coding->eol_type == CODING_EOL_LF
2586 || coding->eol_type == CODING_EOL_UNDECIDED)
2587 *dst++ = '\n';
2588 else if (coding->eol_type == CODING_EOL_CRLF)
2589 *dst++ = '\r', *dst++ = '\n';
2590 else
2591 *dst++ = '\r';
2592 coding->consumed_char++;
2593 break;
2594
2595 case EMACS_leading_code_2:
2596 ONE_MORE_BYTE (c2);
2597 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2598 break;
2599
2600 case EMACS_leading_code_3:
2601 TWO_MORE_BYTES (c2, c3);
2602 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2603 break;
2604
2605 case EMACS_leading_code_4:
2606 THREE_MORE_BYTES (c2, c3, c4);
2607 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2608 break;
2609
2610 default: /* i.e. case EMACS_invalid_code: */
2611 *dst++ = c1;
2612 coding->consumed_char++;
2613 }
2614 continue;
2615
2616 label_end_of_loop:
2617 result = CODING_FINISH_INSUFFICIENT_SRC;
2618 src = src_base;
2619 break;
2620 }
2621
2622 if (result == CODING_FINISH_NORMAL
2623 && src < src_end)
2624 result = CODING_FINISH_INSUFFICIENT_DST;
2625 coding->consumed = src - source;
2626 coding->produced = coding->produced_char = dst - destination;
2627 return result;
2628 }
2629
2630 \f
2631 /*** 5. CCL handlers ***/
2632
2633 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2634 Check if a text is encoded in a coding system of which
2635 encoder/decoder are written in CCL program. If it is, return
2636 CODING_CATEGORY_MASK_CCL, else return 0. */
2637
2638 int
2639 detect_coding_ccl (src, src_end)
2640 unsigned char *src, *src_end;
2641 {
2642 unsigned char *valid;
2643
2644 /* No coding system is assigned to coding-category-ccl. */
2645 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2646 return 0;
2647
2648 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2649 while (src < src_end)
2650 {
2651 if (! valid[*src]) return 0;
2652 src++;
2653 }
2654 return CODING_CATEGORY_MASK_CCL;
2655 }
2656
2657 \f
2658 /*** 6. End-of-line handlers ***/
2659
2660 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2661 This function is called only when `coding->eol_type' is
2662 CODING_EOL_CRLF or CODING_EOL_CR. */
2663
2664 int
2665 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2666 struct coding_system *coding;
2667 unsigned char *source, *destination;
2668 int src_bytes, dst_bytes;
2669 {
2670 unsigned char *src = source;
2671 unsigned char *src_end = source + src_bytes;
2672 unsigned char *dst = destination;
2673 unsigned char *dst_end = destination + dst_bytes;
2674 unsigned char c;
2675 int result = CODING_FINISH_NORMAL;
2676
2677 coding->fake_multibyte = 0;
2678
2679 if (src_bytes <= 0)
2680 {
2681 coding->produced = coding->produced_char = 0;
2682 coding->consumed = coding->consumed_char = 0;
2683 return result;
2684 }
2685
2686 switch (coding->eol_type)
2687 {
2688 case CODING_EOL_CRLF:
2689 {
2690 /* Since the maximum bytes produced by each loop is 2, we
2691 subtract 1 from DST_END to assure overflow checking is
2692 necessary only at the head of loop. */
2693 unsigned char *adjusted_dst_end = dst_end - 1;
2694
2695 while (src < src_end && (dst_bytes
2696 ? (dst < adjusted_dst_end)
2697 : (dst < src - 1)))
2698 {
2699 unsigned char *src_base = src;
2700
2701 c = *src++;
2702 if (c == '\r')
2703 {
2704 ONE_MORE_BYTE (c);
2705 if (c == '\n')
2706 *dst++ = c;
2707 else
2708 {
2709 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2710 {
2711 result = CODING_FINISH_INCONSISTENT_EOL;
2712 goto label_end_of_loop_2;
2713 }
2714 src--;
2715 *dst++ = '\r';
2716 if (BASE_LEADING_CODE_P (c))
2717 coding->fake_multibyte = 1;
2718 }
2719 }
2720 else if (c == '\n'
2721 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2722 {
2723 result = CODING_FINISH_INCONSISTENT_EOL;
2724 goto label_end_of_loop_2;
2725 }
2726 else
2727 {
2728 *dst++ = c;
2729 if (BASE_LEADING_CODE_P (c))
2730 coding->fake_multibyte = 1;
2731 }
2732 continue;
2733
2734 label_end_of_loop:
2735 result = CODING_FINISH_INSUFFICIENT_SRC;
2736 label_end_of_loop_2:
2737 src = src_base;
2738 break;
2739 }
2740 if (src < src_end)
2741 {
2742 if (result == CODING_FINISH_NORMAL)
2743 result = CODING_FINISH_INSUFFICIENT_DST;
2744 else if (result != CODING_FINISH_INCONSISTENT_EOL
2745 && coding->mode & CODING_MODE_LAST_BLOCK)
2746 {
2747 /* This is the last block of the text to be decoded.
2748 We flush out all remaining codes. */
2749 src_bytes = src_end - src;
2750 if (dst_bytes && (dst_end - dst < src_bytes))
2751 src_bytes = dst_end - dst;
2752 bcopy (src, dst, src_bytes);
2753 dst += src_bytes;
2754 src += src_bytes;
2755 }
2756 }
2757 }
2758 break;
2759
2760 case CODING_EOL_CR:
2761 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2762 {
2763 while (src < src_end)
2764 {
2765 if ((c = *src++) == '\n')
2766 break;
2767 if (BASE_LEADING_CODE_P (c))
2768 coding->fake_multibyte = 1;
2769 }
2770 if (*--src == '\n')
2771 {
2772 src_bytes = src - source;
2773 result = CODING_FINISH_INCONSISTENT_EOL;
2774 }
2775 }
2776 if (dst_bytes && src_bytes > dst_bytes)
2777 {
2778 result = CODING_FINISH_INSUFFICIENT_DST;
2779 src_bytes = dst_bytes;
2780 }
2781 if (dst_bytes)
2782 bcopy (source, destination, src_bytes);
2783 else
2784 safe_bcopy (source, destination, src_bytes);
2785 src = source + src_bytes;
2786 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2787 break;
2788
2789 default: /* i.e. case: CODING_EOL_LF */
2790 if (dst_bytes && src_bytes > dst_bytes)
2791 {
2792 result = CODING_FINISH_INSUFFICIENT_DST;
2793 src_bytes = dst_bytes;
2794 }
2795 if (dst_bytes)
2796 bcopy (source, destination, src_bytes);
2797 else
2798 safe_bcopy (source, destination, src_bytes);
2799 src += src_bytes;
2800 dst += src_bytes;
2801 coding->fake_multibyte = 1;
2802 break;
2803 }
2804
2805 coding->consumed = coding->consumed_char = src - source;
2806 coding->produced = coding->produced_char = dst - destination;
2807 return result;
2808 }
2809
2810 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2811 format of end-of-line according to `coding->eol_type'. If
2812 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2813 '\r' in source text also means end-of-line. */
2814
2815 int
2816 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2817 struct coding_system *coding;
2818 unsigned char *source, *destination;
2819 int src_bytes, dst_bytes;
2820 {
2821 unsigned char *src = source;
2822 unsigned char *dst = destination;
2823 int result = CODING_FINISH_NORMAL;
2824
2825 coding->fake_multibyte = 0;
2826
2827 if (coding->eol_type == CODING_EOL_CRLF)
2828 {
2829 unsigned char c;
2830 unsigned char *src_end = source + src_bytes;
2831 unsigned char *dst_end = destination + dst_bytes;
2832 /* Since the maximum bytes produced by each loop is 2, we
2833 subtract 1 from DST_END to assure overflow checking is
2834 necessary only at the head of loop. */
2835 unsigned char *adjusted_dst_end = dst_end - 1;
2836
2837 while (src < src_end && (dst_bytes
2838 ? (dst < adjusted_dst_end)
2839 : (dst < src - 1)))
2840 {
2841 c = *src++;
2842 if (c == '\n'
2843 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2844 *dst++ = '\r', *dst++ = '\n';
2845 else
2846 {
2847 *dst++ = c;
2848 if (BASE_LEADING_CODE_P (c))
2849 coding->fake_multibyte = 1;
2850 }
2851 }
2852 if (src < src_end)
2853 result = CODING_FINISH_INSUFFICIENT_DST;
2854 }
2855 else
2856 {
2857 unsigned char c;
2858
2859 if (dst_bytes && src_bytes > dst_bytes)
2860 {
2861 src_bytes = dst_bytes;
2862 result = CODING_FINISH_INSUFFICIENT_DST;
2863 }
2864 if (dst_bytes)
2865 bcopy (source, destination, src_bytes);
2866 else
2867 safe_bcopy (source, destination, src_bytes);
2868 dst_bytes = src_bytes;
2869 if (coding->eol_type == CODING_EOL_CR)
2870 {
2871 while (src_bytes--)
2872 {
2873 if ((c = *dst++) == '\n')
2874 dst[-1] = '\r';
2875 else if (BASE_LEADING_CODE_P (c))
2876 coding->fake_multibyte = 1;
2877 }
2878 }
2879 else
2880 {
2881 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2882 {
2883 while (src_bytes--)
2884 if (*dst++ == '\r') dst[-1] = '\n';
2885 }
2886 coding->fake_multibyte = 1;
2887 }
2888 src = source + dst_bytes;
2889 dst = destination + dst_bytes;
2890 }
2891
2892 coding->consumed = coding->consumed_char = src - source;
2893 coding->produced = coding->produced_char = dst - destination;
2894 return result;
2895 }
2896
2897 \f
2898 /*** 7. C library functions ***/
2899
2900 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2901 has a property `coding-system'. The value of this property is a
   vector of length 5 (called the coding-vector).  Among the elements of
2903 this vector, the first (element[0]) and the fifth (element[4])
2904 carry important information for decoding/encoding. Before
2905 decoding/encoding, this information should be set in fields of a
2906 structure of type `coding_system'.
2907
2908 A value of property `coding-system' can be a symbol of another
2909 subsidiary coding-system. In that case, Emacs gets coding-vector
2910 from that symbol.
2911
2912 `element[0]' contains information to be set in `coding->type'. The
2913 value and its meaning is as follows:
2914
2915 0 -- coding_type_emacs_mule
2916 1 -- coding_type_sjis
2917 2 -- coding_type_iso2022
2918 3 -- coding_type_big5
2919 4 -- coding_type_ccl encoder/decoder written in CCL
2920 nil -- coding_type_no_conversion
2921 t -- coding_type_undecided (automatic conversion on decoding,
2922 no-conversion on encoding)
2923
2924 `element[4]' contains information to be set in `coding->flags' and
2925 `coding->spec'. The meaning varies by `coding->type'.
2926
2927 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2928 of length 32 (of which the first 13 sub-elements are used now).
2929 Meanings of these sub-elements are:
2930
2931 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2932 If the value is an integer of valid charset, the charset is
2933 assumed to be designated to graphic register N initially.
2934
   If the value is negative, it is the negated value of a charset which
2936 reserves graphic register N, which means that the charset is
2937 not designated initially but should be designated to graphic
2938 register N just before encoding a character in that charset.
2939
2940 If the value is nil, graphic register N is never used on
2941 encoding.
2942
2943 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2944 Each value takes t or nil. See the section ISO2022 of
2945 `coding.h' for more information.
2946
2947 If `coding->type' is `coding_type_big5', element[4] is t to denote
2948 BIG5-ETen or nil to denote BIG5-HKU.
2949
2950 If `coding->type' takes the other value, element[4] is ignored.
2951
2952 Emacs Lisp's coding system also carries information about format of
2953 end-of-line in a value of property `eol-type'. If the value is
2954 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2955 means CODING_EOL_CR. If it is not integer, it should be a vector
2956 of subsidiary coding systems of which property `eol-type' has one
2957 of above values.
2958
2959 */
2960
2961 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2962 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2963 is setup so that no conversion is necessary and return -1, else
2964 return 0. */
2965
2966 int
2967 setup_coding_system (coding_system, coding)
2968 Lisp_Object coding_system;
2969 struct coding_system *coding;
2970 {
2971 Lisp_Object coding_spec, coding_type, eol_type, plist;
2972 Lisp_Object val;
2973 int i;
2974
2975 /* Initialize some fields required for all kinds of coding systems. */
2976 coding->symbol = coding_system;
2977 coding->common_flags = 0;
2978 coding->mode = 0;
2979 coding->heading_ascii = -1;
2980 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2981 coding->composing = COMPOSITION_DISABLED;
2982 coding->cmp_data = NULL;
2983
2984 if (NILP (coding_system))
2985 goto label_invalid_coding_system;
2986
2987 coding_spec = Fget (coding_system, Qcoding_system);
2988
2989 if (!VECTORP (coding_spec)
2990 || XVECTOR (coding_spec)->size != 5
2991 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2992 goto label_invalid_coding_system;
2993
2994 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2995 if (VECTORP (eol_type))
2996 {
2997 coding->eol_type = CODING_EOL_UNDECIDED;
2998 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2999 }
3000 else if (XFASTINT (eol_type) == 1)
3001 {
3002 coding->eol_type = CODING_EOL_CRLF;
3003 coding->common_flags
3004 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3005 }
3006 else if (XFASTINT (eol_type) == 2)
3007 {
3008 coding->eol_type = CODING_EOL_CR;
3009 coding->common_flags
3010 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3011 }
3012 else
3013 coding->eol_type = CODING_EOL_LF;
3014
3015 coding_type = XVECTOR (coding_spec)->contents[0];
3016 /* Try short cut. */
3017 if (SYMBOLP (coding_type))
3018 {
3019 if (EQ (coding_type, Qt))
3020 {
3021 coding->type = coding_type_undecided;
3022 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3023 }
3024 else
3025 coding->type = coding_type_no_conversion;
3026 return 0;
3027 }
3028
3029 /* Get values of coding system properties:
3030 `post-read-conversion', `pre-write-conversion',
3031 `translation-table-for-decode', `translation-table-for-encode'. */
3032 plist = XVECTOR (coding_spec)->contents[3];
3033 /* Pre & post conversion functions should be disabled if
3034 inhibit_eol_conversion is nozero. This is the case that a code
3035 conversion function is called while those functions are running. */
3036 if (! inhibit_pre_post_conversion)
3037 {
3038 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3039 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3040 }
3041 val = Fplist_get (plist, Qtranslation_table_for_decode);
3042 if (SYMBOLP (val))
3043 val = Fget (val, Qtranslation_table_for_decode);
3044 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3045 val = Fplist_get (plist, Qtranslation_table_for_encode);
3046 if (SYMBOLP (val))
3047 val = Fget (val, Qtranslation_table_for_encode);
3048 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3049 val = Fplist_get (plist, Qcoding_category);
3050 if (!NILP (val))
3051 {
3052 val = Fget (val, Qcoding_category_index);
3053 if (INTEGERP (val))
3054 coding->category_idx = XINT (val);
3055 else
3056 goto label_invalid_coding_system;
3057 }
3058 else
3059 goto label_invalid_coding_system;
3060
3061 val = Fplist_get (plist, Qsafe_charsets);
3062 if (EQ (val, Qt))
3063 {
3064 for (i = 0; i <= MAX_CHARSET; i++)
3065 coding->safe_charsets[i] = 1;
3066 }
3067 else
3068 {
3069 bzero (coding->safe_charsets, MAX_CHARSET + 1);
3070 while (CONSP (val))
3071 {
3072 if ((i = get_charset_id (XCAR (val))) >= 0)
3073 coding->safe_charsets[i] = 1;
3074 val = XCDR (val);
3075 }
3076 }
3077
3078 /* If the coding system has non-nil `composition' property, enable
3079 composition handling. */
3080 val = Fplist_get (plist, Qcomposition);
3081 if (!NILP (val))
3082 coding->composing = COMPOSITION_NO;
3083
3084 switch (XFASTINT (coding_type))
3085 {
3086 case 0:
3087 coding->type = coding_type_emacs_mule;
3088 if (!NILP (coding->post_read_conversion))
3089 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3090 if (!NILP (coding->pre_write_conversion))
3091 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3092 break;
3093
3094 case 1:
3095 coding->type = coding_type_sjis;
3096 coding->common_flags
3097 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3098 break;
3099
3100 case 2:
3101 coding->type = coding_type_iso2022;
3102 coding->common_flags
3103 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3104 {
3105 Lisp_Object val, temp;
3106 Lisp_Object *flags;
3107 int i, charset, reg_bits = 0;
3108
3109 val = XVECTOR (coding_spec)->contents[4];
3110
3111 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3112 goto label_invalid_coding_system;
3113
3114 flags = XVECTOR (val)->contents;
3115 coding->flags
3116 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3117 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3118 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3119 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3120 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3121 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3122 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3123 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3124 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3125 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3126 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3127 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3128 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3129 );
3130
3131 /* Invoke graphic register 0 to plane 0. */
3132 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3133 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3134 CODING_SPEC_ISO_INVOCATION (coding, 1)
3135 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3136 /* Not single shifting at first. */
3137 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3138 /* Beginning of buffer should also be regarded as bol. */
3139 CODING_SPEC_ISO_BOL (coding) = 1;
3140
3141 for (charset = 0; charset <= MAX_CHARSET; charset++)
3142 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3143 val = Vcharset_revision_alist;
3144 while (CONSP (val))
3145 {
3146 charset = get_charset_id (Fcar_safe (XCAR (val)));
3147 if (charset >= 0
3148 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3149 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3150 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3151 val = XCDR (val);
3152 }
3153
3154 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3155 FLAGS[REG] can be one of below:
3156 integer CHARSET: CHARSET occupies register I,
3157 t: designate nothing to REG initially, but can be used
3158 by any charsets,
3159 list of integer, nil, or t: designate the first
3160 element (if integer) to REG initially, the remaining
3161 elements (if integer) is designated to REG on request,
3162 if an element is t, REG can be used by any charsets,
3163 nil: REG is never used. */
3164 for (charset = 0; charset <= MAX_CHARSET; charset++)
3165 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3166 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3167 for (i = 0; i < 4; i++)
3168 {
3169 if (INTEGERP (flags[i])
3170 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3171 || (charset = get_charset_id (flags[i])) >= 0)
3172 {
3173 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3174 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3175 }
3176 else if (EQ (flags[i], Qt))
3177 {
3178 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3179 reg_bits |= 1 << i;
3180 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3181 }
3182 else if (CONSP (flags[i]))
3183 {
3184 Lisp_Object tail;
3185 tail = flags[i];
3186
3187 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3188 if (INTEGERP (XCAR (tail))
3189 && (charset = XINT (XCAR (tail)),
3190 CHARSET_VALID_P (charset))
3191 || (charset = get_charset_id (XCAR (tail))) >= 0)
3192 {
3193 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3194 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3195 }
3196 else
3197 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3198 tail = XCDR (tail);
3199 while (CONSP (tail))
3200 {
3201 if (INTEGERP (XCAR (tail))
3202 && (charset = XINT (XCAR (tail)),
3203 CHARSET_VALID_P (charset))
3204 || (charset = get_charset_id (XCAR (tail))) >= 0)
3205 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3206 = i;
3207 else if (EQ (XCAR (tail), Qt))
3208 reg_bits |= 1 << i;
3209 tail = XCDR (tail);
3210 }
3211 }
3212 else
3213 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3214
3215 CODING_SPEC_ISO_DESIGNATION (coding, i)
3216 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3217 }
3218
3219 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3220 {
3221 /* REG 1 can be used only by locking shift in 7-bit env. */
3222 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3223 reg_bits &= ~2;
3224 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3225 /* Without any shifting, only REG 0 and 1 can be used. */
3226 reg_bits &= 3;
3227 }
3228
3229 if (reg_bits)
3230 for (charset = 0; charset <= MAX_CHARSET; charset++)
3231 {
3232 if (CHARSET_VALID_P (charset))
3233 {
3234 /* There exist some default graphic registers to be
3235 used CHARSET. */
3236
3237 /* We had better avoid designating a charset of
3238 CHARS96 to REG 0 as far as possible. */
3239 if (CHARSET_CHARS (charset) == 96)
3240 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3241 = (reg_bits & 2
3242 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3243 else
3244 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3245 = (reg_bits & 1
3246 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3247 }
3248 }
3249 }
3250 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3251 coding->spec.iso2022.last_invalid_designation_register = -1;
3252 break;
3253
3254 case 3:
3255 coding->type = coding_type_big5;
3256 coding->common_flags
3257 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3258 coding->flags
3259 = (NILP (XVECTOR (coding_spec)->contents[4])
3260 ? CODING_FLAG_BIG5_HKU
3261 : CODING_FLAG_BIG5_ETEN);
3262 break;
3263
3264 case 4:
3265 coding->type = coding_type_ccl;
3266 coding->common_flags
3267 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3268 {
3269 val = XVECTOR (coding_spec)->contents[4];
3270 if (! CONSP (val)
3271 || setup_ccl_program (&(coding->spec.ccl.decoder),
3272 XCAR (val)) < 0
3273 || setup_ccl_program (&(coding->spec.ccl.encoder),
3274 XCDR (val)) < 0)
3275 goto label_invalid_coding_system;
3276
3277 bzero (coding->spec.ccl.valid_codes, 256);
3278 val = Fplist_get (plist, Qvalid_codes);
3279 if (CONSP (val))
3280 {
3281 Lisp_Object this;
3282
3283 for (; CONSP (val); val = XCDR (val))
3284 {
3285 this = XCAR (val);
3286 if (INTEGERP (this)
3287 && XINT (this) >= 0 && XINT (this) < 256)
3288 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3289 else if (CONSP (this)
3290 && INTEGERP (XCAR (this))
3291 && INTEGERP (XCDR (this)))
3292 {
3293 int start = XINT (XCAR (this));
3294 int end = XINT (XCDR (this));
3295
3296 if (start >= 0 && start <= end && end < 256)
3297 while (start <= end)
3298 coding->spec.ccl.valid_codes[start++] = 1;
3299 }
3300 }
3301 }
3302 }
3303 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3304 break;
3305
3306 case 5:
3307 coding->type = coding_type_raw_text;
3308 break;
3309
3310 default:
3311 goto label_invalid_coding_system;
3312 }
3313 return 0;
3314
3315 label_invalid_coding_system:
3316 coding->type = coding_type_no_conversion;
3317 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3318 coding->common_flags = 0;
3319 coding->eol_type = CODING_EOL_LF;
3320 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3321 return -1;
3322 }
3323
3324 /* Free memory blocks allocated for storing composition information. */
3325
3326 void
3327 coding_free_composition_data (coding)
3328 struct coding_system *coding;
3329 {
3330 struct composition_data *cmp_data = coding->cmp_data, *next;
3331
3332 if (!cmp_data)
3333 return;
3334 /* Memory blocks are chained. At first, rewind to the first, then,
3335 free blocks one by one. */
3336 while (cmp_data->prev)
3337 cmp_data = cmp_data->prev;
3338 while (cmp_data)
3339 {
3340 next = cmp_data->next;
3341 xfree (cmp_data);
3342 cmp_data = next;
3343 }
3344 coding->cmp_data = NULL;
3345 }
3346
3347 /* Set `char_offset' member of all memory blocks pointed by
3348 coding->cmp_data to POS. */
3349
3350 void
3351 coding_adjust_composition_offset (coding, pos)
3352 struct coding_system *coding;
3353 int pos;
3354 {
3355 struct composition_data *cmp_data;
3356
3357 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3358 cmp_data->char_offset = pos;
3359 }
3360
3361 /* Setup raw-text or one of its subsidiaries in the structure
3362 coding_system CODING according to the already setup value eol_type
3363 in CODING. CODING should be setup for some coding system in
3364 advance. */
3365
3366 void
3367 setup_raw_text_coding_system (coding)
3368 struct coding_system *coding;
3369 {
3370 if (coding->type != coding_type_raw_text)
3371 {
3372 coding->symbol = Qraw_text;
3373 coding->type = coding_type_raw_text;
3374 if (coding->eol_type != CODING_EOL_UNDECIDED)
3375 {
3376 Lisp_Object subsidiaries;
3377 subsidiaries = Fget (Qraw_text, Qeol_type);
3378
3379 if (VECTORP (subsidiaries)
3380 && XVECTOR (subsidiaries)->size == 3)
3381 coding->symbol
3382 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3383 }
3384 setup_coding_system (coding->symbol, coding);
3385 }
3386 return;
3387 }
3388
3389 /* Emacs has a mechanism to automatically detect a coding system if it
3390 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3391 it's impossible to distinguish some coding systems accurately
3392 because they use the same range of codes. So, at first, coding
3393 systems are grouped into the following categories:
3394
3395 o coding-category-emacs-mule
3396
3397 The category for a coding system which has the same code range
3398 as Emacs' internal format. Assigned the coding-system (Lisp
3399 symbol) `emacs-mule' by default.
3400
3401 o coding-category-sjis
3402
3403 The category for a coding system which has the same code range
3404 as SJIS. Assigned the coding-system (Lisp
3405 symbol) `japanese-shift-jis' by default.
3406
3407 o coding-category-iso-7
3408
3409 The category for a coding system which has the same code range
3410 as ISO2022 of 7-bit environment. This doesn't use any locking
3411 shift and single shift functions. This can encode/decode all
3412 charsets. Assigned the coding-system (Lisp symbol)
3413 `iso-2022-7bit' by default.
3414
3415 o coding-category-iso-7-tight
3416
3417 Same as coding-category-iso-7 except that this can
3418 encode/decode only the specified charsets.
3419
3420 o coding-category-iso-8-1
3421
3422 The category for a coding system which has the same code range
3423 as ISO2022 of 8-bit environment and graphic plane 1 used only
3424 for DIMENSION1 charset. This doesn't use any locking shift
3425 and single shift functions. Assigned the coding-system (Lisp
3426 symbol) `iso-latin-1' by default.
3427
3428 o coding-category-iso-8-2
3429
3430 The category for a coding system which has the same code range
3431 as ISO2022 of 8-bit environment and graphic plane 1 used only
3432 for DIMENSION2 charset. This doesn't use any locking shift
3433 and single shift functions. Assigned the coding-system (Lisp
3434 symbol) `japanese-iso-8bit' by default.
3435
3436 o coding-category-iso-7-else
3437
3438 The category for a coding system which has the same code range
3439 as ISO2022 of 7-bit environment but uses locking shift or
3440 single shift functions. Assigned the coding-system (Lisp
3441 symbol) `iso-2022-7bit-lock' by default.
3442
3443 o coding-category-iso-8-else
3444
3445 The category for a coding system which has the same code range
3446 as ISO2022 of 8-bit environment but uses locking shift or
3447 single shift functions. Assigned the coding-system (Lisp
3448 symbol) `iso-2022-8bit-ss2' by default.
3449
3450 o coding-category-big5
3451
3452 The category for a coding system which has the same code range
3453 as BIG5. Assigned the coding-system (Lisp symbol)
3454 `cn-big5' by default.
3455
3456 o coding-category-ccl
3457
3458 The category for a coding system of which encoder/decoder is
3459 written in CCL programs. The default value is nil, i.e., no
3460 coding system is assigned.
3461
3462 o coding-category-binary
3463
3464 The category for a coding system not categorized in any of the
3465 above. Assigned the coding-system (Lisp symbol)
3466 `no-conversion' by default.
3467
3468 Each of them is a Lisp symbol and the value is an actual
3469 `coding-system's (this is also a Lisp symbol) assigned by a user.
3470 What Emacs does actually is to detect a category of coding system.
3471 Then, it uses a `coding-system' assigned to it. If Emacs can't
3472 decide only one possible category, it selects a category of the
3473 highest priority. Priorities of categories are also specified by a
3474 user in a Lisp variable `coding-category-list'.
3475
3476 */
3477
3478 static
3479 int ascii_skip_code[256];
3480
3481 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3482 If it detects possible coding systems, return an integer in which
3483 appropriate flag bits are set. Flag bits are defined by macros
3484 CODING_CATEGORY_MASK_XXX in `coding.h'.
3485
3486 How many ASCII characters are at the head is returned as *SKIP. */
3487
3488 static int
3489 detect_coding_mask (source, src_bytes, priorities, skip)
3490 unsigned char *source;
3491 int src_bytes, *priorities, *skip;
3492 {
3493 register unsigned char c;
3494 unsigned char *src = source, *src_end = source + src_bytes;
3495 unsigned int mask;
3496 int i;
3497
3498 /* At first, skip all ASCII characters and control characters except
3499 for three ISO2022 specific control characters. */
3500 ascii_skip_code[ISO_CODE_SO] = 0;
3501 ascii_skip_code[ISO_CODE_SI] = 0;
3502 ascii_skip_code[ISO_CODE_ESC] = 0;
3503
3504 label_loop_detect_coding:
3505 while (src < src_end && ascii_skip_code[*src]) src++;
3506 *skip = src - source;
3507
3508 if (src >= src_end)
3509 /* We found nothing other than ASCII. There's nothing to do. */
3510 return 0;
3511
3512 c = *src;
3513 /* The text seems to be encoded in some multilingual coding system.
3514 Now, try to find in which coding system the text is encoded. */
3515 if (c < 0x80)
3516 {
3517 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3518 /* C is an ISO2022 specific control code of C0. */
3519 mask = detect_coding_iso2022 (src, src_end);
3520 if (mask == 0)
3521 {
3522 /* No valid ISO2022 code follows C. Try again. */
3523 src++;
3524 if (c == ISO_CODE_ESC)
3525 ascii_skip_code[ISO_CODE_ESC] = 1;
3526 else
3527 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3528 goto label_loop_detect_coding;
3529 }
3530 if (priorities)
3531 goto label_return_highest_only;
3532 }
3533 else
3534 {
3535 int try;
3536
3537 if (c < 0xA0)
3538 {
3539 /* C is the first byte of SJIS character code,
3540 or a leading-code of Emacs' internal format (emacs-mule). */
3541 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3542
3543 /* Or, if C is a special latin extra code,
3544 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3545 or is an ISO2022 control-sequence-introducer (CSI),
3546 we should also consider the possibility of ISO2022 codings. */
3547 if ((VECTORP (Vlatin_extra_code_table)
3548 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3549 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3550 || (c == ISO_CODE_CSI
3551 && (src < src_end
3552 && (*src == ']'
3553 || ((*src == '0' || *src == '1' || *src == '2')
3554 && src + 1 < src_end
3555 && src[1] == ']')))))
3556 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3557 | CODING_CATEGORY_MASK_ISO_8BIT);
3558 }
3559 else
3560 /* C is a character of ISO2022 in graphic plane right,
3561 or a SJIS's 1-byte character code (i.e. JISX0201),
3562 or the first byte of BIG5's 2-byte code. */
3563 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3564 | CODING_CATEGORY_MASK_ISO_8BIT
3565 | CODING_CATEGORY_MASK_SJIS
3566 | CODING_CATEGORY_MASK_BIG5);
3567
3568 /* Or, we may have to consider the possibility of CCL. */
3569 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3570 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3571 ->spec.ccl.valid_codes)[c])
3572 try |= CODING_CATEGORY_MASK_CCL;
3573
3574 mask = 0;
3575 if (priorities)
3576 {
3577 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3578 {
3579 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
3580 mask = detect_coding_iso2022 (src, src_end);
3581 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3582 mask = detect_coding_sjis (src, src_end);
3583 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3584 mask = detect_coding_big5 (src, src_end);
3585 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3586 mask = detect_coding_emacs_mule (src, src_end);
3587 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3588 mask = detect_coding_ccl (src, src_end);
3589 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3590 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3591 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3592 mask = CODING_CATEGORY_MASK_BINARY;
3593 if (mask)
3594 goto label_return_highest_only;
3595 }
3596 return CODING_CATEGORY_MASK_RAW_TEXT;
3597 }
3598 if (try & CODING_CATEGORY_MASK_ISO)
3599 mask |= detect_coding_iso2022 (src, src_end);
3600 if (try & CODING_CATEGORY_MASK_SJIS)
3601 mask |= detect_coding_sjis (src, src_end);
3602 if (try & CODING_CATEGORY_MASK_BIG5)
3603 mask |= detect_coding_big5 (src, src_end);
3604 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3605 mask |= detect_coding_emacs_mule (src, src_end);
3606 if (try & CODING_CATEGORY_MASK_CCL)
3607 mask |= detect_coding_ccl (src, src_end);
3608 }
3609 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3610
3611 label_return_highest_only:
3612 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3613 {
3614 if (mask & priorities[i])
3615 return priorities[i];
3616 }
3617 return CODING_CATEGORY_MASK_RAW_TEXT;
3618 }
3619
3620 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3621 The information of the detected coding system is set in CODING. */
3622
3623 void
3624 detect_coding (coding, src, src_bytes)
3625 struct coding_system *coding;
3626 unsigned char *src;
3627 int src_bytes;
3628 {
3629 unsigned int idx;
3630 int skip, mask, i;
3631 Lisp_Object val;
3632
3633 val = Vcoding_category_list;
3634 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3635 coding->heading_ascii = skip;
3636
3637 if (!mask) return;
3638
3639 /* We found a single coding system of the highest priority in MASK. */
3640 idx = 0;
3641 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3642 if (! mask)
3643 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3644
3645 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3646
3647 if (coding->eol_type != CODING_EOL_UNDECIDED)
3648 {
3649 Lisp_Object tmp;
3650
3651 tmp = Fget (val, Qeol_type);
3652 if (VECTORP (tmp))
3653 val = XVECTOR (tmp)->contents[coding->eol_type];
3654 }
3655 setup_coding_system (val, coding);
3656 /* Set this again because setup_coding_system reset this member. */
3657 coding->heading_ascii = skip;
3658 }
3659
3660 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3661 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3662 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3663
3664 How many non-eol characters are at the head is returned as *SKIP. */
3665
3666 #define MAX_EOL_CHECK_COUNT 3
3667
3668 static int
3669 detect_eol_type (source, src_bytes, skip)
3670 unsigned char *source;
3671 int src_bytes, *skip;
3672 {
3673 unsigned char *src = source, *src_end = src + src_bytes;
3674 unsigned char c;
3675 int total = 0; /* How many end-of-lines are found so far. */
3676 int eol_type = CODING_EOL_UNDECIDED;
3677 int this_eol_type;
3678
3679 *skip = 0;
3680
3681 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3682 {
3683 c = *src++;
3684 if (c == '\n' || c == '\r')
3685 {
3686 if (*skip == 0)
3687 *skip = src - 1 - source;
3688 total++;
3689 if (c == '\n')
3690 this_eol_type = CODING_EOL_LF;
3691 else if (src >= src_end || *src != '\n')
3692 this_eol_type = CODING_EOL_CR;
3693 else
3694 this_eol_type = CODING_EOL_CRLF, src++;
3695
3696 if (eol_type == CODING_EOL_UNDECIDED)
3697 /* This is the first end-of-line. */
3698 eol_type = this_eol_type;
3699 else if (eol_type != this_eol_type)
3700 {
3701 /* The found type is different from what found before. */
3702 eol_type = CODING_EOL_INCONSISTENT;
3703 break;
3704 }
3705 }
3706 }
3707
3708 if (*skip == 0)
3709 *skip = src_end - source;
3710 return eol_type;
3711 }
3712
3713 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3714 is encoded. If it detects an appropriate format of end-of-line, it
3715 sets the information in *CODING. */
3716
3717 void
3718 detect_eol (coding, src, src_bytes)
3719 struct coding_system *coding;
3720 unsigned char *src;
3721 int src_bytes;
3722 {
3723 Lisp_Object val;
3724 int skip;
3725 int eol_type = detect_eol_type (src, src_bytes, &skip);
3726
3727 if (coding->heading_ascii > skip)
3728 coding->heading_ascii = skip;
3729 else
3730 skip = coding->heading_ascii;
3731
3732 if (eol_type == CODING_EOL_UNDECIDED)
3733 return;
3734 if (eol_type == CODING_EOL_INCONSISTENT)
3735 {
3736 #if 0
3737 /* This code is suppressed until we find a better way to
3738 distinguish raw text file and binary file. */
3739
3740 /* If we have already detected that the coding is raw-text, the
3741 coding should actually be no-conversion. */
3742 if (coding->type == coding_type_raw_text)
3743 {
3744 setup_coding_system (Qno_conversion, coding);
3745 return;
3746 }
3747 /* Else, let's decode only text code anyway. */
3748 #endif /* 0 */
3749 eol_type = CODING_EOL_LF;
3750 }
3751
3752 val = Fget (coding->symbol, Qeol_type);
3753 if (VECTORP (val) && XVECTOR (val)->size == 3)
3754 {
3755 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3756 coding->heading_ascii = skip;
3757 }
3758 }
3759
/* Slack added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding may enlarge text of CODING (a pointer to
   struct coding_system): 3 for ISO2022, 2 for SJIS and BIG5, 1 for
   raw-text, the decoder's own magnification for CCL, and 2 for
   everything else.  CODING is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                                          \
  ((coding)->type == coding_type_iso2022                                     \
   ? 3                                                                       \
   : (((coding)->type == coding_type_sjis                                    \
       || (coding)->type == coding_type_big5)                                \
      ? 2                                                                    \
      : ((coding)->type == coding_type_raw_text                              \
         ? 1                                                                 \
         : ((coding)->type == coding_type_ccl                                \
            ? (coding)->spec.ccl.decoder.buf_magnification                   \
            : 2))))
3772
3773 /* Return maximum size (bytes) of a buffer enough for decoding
3774 SRC_BYTES of text encoded in CODING. */
3775
3776 int
3777 decoding_buffer_size (coding, src_bytes)
3778 struct coding_system *coding;
3779 int src_bytes;
3780 {
3781 return (src_bytes * DECODING_BUFFER_MAG (coding)
3782 + CONVERSION_BUFFER_EXTRA_ROOM);
3783 }
3784
3785 /* Return maximum size (bytes) of a buffer enough for encoding
3786 SRC_BYTES of text to CODING. */
3787
3788 int
3789 encoding_buffer_size (coding, src_bytes)
3790 struct coding_system *coding;
3791 int src_bytes;
3792 {
3793 int magnification;
3794
3795 if (coding->type == coding_type_ccl)
3796 magnification = coding->spec.ccl.encoder.buf_magnification;
3797 else
3798 magnification = 3;
3799
3800 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3801 }
3802
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and
   its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is shared: it is reallocated when
   it is too small, and the previous contents are NOT preserved, so a
   pointer obtained from an earlier call must not be used after a
   later call asks for more room.

   The return value is whatever xmalloc hands back; no check for an
   allocation failure is made here.
   NOTE(review): presumably xmalloc signals an error instead of
   returning NULL -- confirm against its definition.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Never start the doubling below from zero (which would be the
         case if the buffer has not been set up yet): doubling zero
         never reaches SIZE and the loop would not terminate.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3831
3832 int
3833 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3834 struct coding_system *coding;
3835 unsigned char *source, *destination;
3836 int src_bytes, dst_bytes, encodep;
3837 {
3838 struct ccl_program *ccl
3839 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3840 int result;
3841
3842 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3843
3844 coding->produced = ccl_driver (ccl, source, destination,
3845 src_bytes, dst_bytes, &(coding->consumed));
3846 coding->produced_char
3847 = (encodep
3848 ? coding->produced
3849 : multibyte_chars_in_text (destination, coding->produced));
3850 coding->consumed_char
3851 = multibyte_chars_in_text (source, coding->consumed);
3852
3853 switch (ccl->status)
3854 {
3855 case CCL_STAT_SUSPEND_BY_SRC:
3856 result = CODING_FINISH_INSUFFICIENT_SRC;
3857 break;
3858 case CCL_STAT_SUSPEND_BY_DST:
3859 result = CODING_FINISH_INSUFFICIENT_DST;
3860 break;
3861 case CCL_STAT_QUIT:
3862 case CCL_STAT_INVALID_CMD:
3863 result = CODING_FINISH_INTERRUPT;
3864 break;
3865 default:
3866 result = CODING_FINISH_NORMAL;
3867 break;
3868 }
3869 return result;
3870 }
3871
3872 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3873 decoding, it may detect coding system and format of end-of-line if
3874 those are not yet decided.
3875
3876 This function does not make full use of DESTINATION buffer. For
3877 instance, if coding->type is coding_type_iso2022, it uses only
3878 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
3879 DST_BYTES is decided by the function decoding_buffer_size, it
3880 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3881 So, this function can decode the full SOURCE. But, in the other
3882 case, if you want to avoid carry over, you must supply at least 7
3883 bytes more area in DESTINATION buffer than expected maximum bytes
3884 that will be produced by this function. */
3885
3886 int
3887 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3888 struct coding_system *coding;
3889 unsigned char *source, *destination;
3890 int src_bytes, dst_bytes;
3891 {
3892 int result;
3893
3894 if (src_bytes <= 0
3895 && coding->type != coding_type_ccl
3896 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3897 && CODING_REQUIRE_FLUSHING (coding)))
3898 {
3899 coding->produced = coding->produced_char = 0;
3900 coding->consumed = coding->consumed_char = 0;
3901 coding->fake_multibyte = 0;
3902 return CODING_FINISH_NORMAL;
3903 }
3904
3905 if (coding->type == coding_type_undecided)
3906 detect_coding (coding, source, src_bytes);
3907
3908 if (coding->eol_type == CODING_EOL_UNDECIDED)
3909 detect_eol (coding, source, src_bytes);
3910
3911 switch (coding->type)
3912 {
3913 case coding_type_emacs_mule:
3914 case coding_type_undecided:
3915 case coding_type_raw_text:
3916 if (coding->eol_type == CODING_EOL_LF
3917 || coding->eol_type == CODING_EOL_UNDECIDED)
3918 goto label_no_conversion;
3919 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3920 break;
3921
3922 case coding_type_sjis:
3923 result = decode_coding_sjis_big5 (coding, source, destination,
3924 src_bytes, dst_bytes, 1);
3925 break;
3926
3927 case coding_type_iso2022:
3928 result = decode_coding_iso2022 (coding, source, destination,
3929 src_bytes, dst_bytes);
3930 break;
3931
3932 case coding_type_big5:
3933 result = decode_coding_sjis_big5 (coding, source, destination,
3934 src_bytes, dst_bytes, 0);
3935 break;
3936
3937 case coding_type_ccl:
3938 result = ccl_coding_driver (coding, source, destination,
3939 src_bytes, dst_bytes, 0);
3940 break;
3941
3942 default: /* i.e. case coding_type_no_conversion: */
3943 label_no_conversion:
3944 if (dst_bytes && src_bytes > dst_bytes)
3945 {
3946 coding->produced = dst_bytes;
3947 result = CODING_FINISH_INSUFFICIENT_DST;
3948 }
3949 else
3950 {
3951 coding->produced = src_bytes;
3952 result = CODING_FINISH_NORMAL;
3953 }
3954 if (dst_bytes)
3955 bcopy (source, destination, coding->produced);
3956 else
3957 safe_bcopy (source, destination, coding->produced);
3958 coding->fake_multibyte = 1;
3959 coding->consumed
3960 = coding->consumed_char = coding->produced_char = coding->produced;
3961 break;
3962 }
3963
3964 return result;
3965 }
3966
3967 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3968
3969 This function does not make full use of DESTINATION buffer. For
3970 instance, if coding->type is coding_type_iso2022, it uses only
3971 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
3972 DST_BYTES is decided by the function encoding_buffer_size, it
3973 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3974 So, this function can encode the full SOURCE. But, in the other
3975 case, if you want to avoid carry over, you must supply at least 20
3976 bytes more area in DESTINATION buffer than expected maximum bytes
3977 that will be produced by this function. */
3978
3979 int
3980 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3981 struct coding_system *coding;
3982 unsigned char *source, *destination;
3983 int src_bytes, dst_bytes;
3984 {
3985 int result;
3986
3987 if (src_bytes <= 0
3988 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3989 && CODING_REQUIRE_FLUSHING (coding)))
3990 {
3991 coding->produced = coding->produced_char = 0;
3992 coding->consumed = coding->consumed_char = 0;
3993 coding->fake_multibyte = 0;
3994 return CODING_FINISH_NORMAL;
3995 }
3996
3997 switch (coding->type)
3998 {
3999 case coding_type_emacs_mule:
4000 case coding_type_undecided:
4001 case coding_type_raw_text:
4002 if (coding->eol_type == CODING_EOL_LF
4003 || coding->eol_type == CODING_EOL_UNDECIDED)
4004 goto label_no_conversion;
4005 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4006 break;
4007
4008 case coding_type_sjis:
4009 result = encode_coding_sjis_big5 (coding, source, destination,
4010 src_bytes, dst_bytes, 1);
4011 break;
4012
4013 case coding_type_iso2022:
4014 result = encode_coding_iso2022 (coding, source, destination,
4015 src_bytes, dst_bytes);
4016 break;
4017
4018 case coding_type_big5:
4019 result = encode_coding_sjis_big5 (coding, source, destination,
4020 src_bytes, dst_bytes, 0);
4021 break;
4022
4023 case coding_type_ccl:
4024 result = ccl_coding_driver (coding, source, destination,
4025 src_bytes, dst_bytes, 1);
4026 break;
4027
4028 default: /* i.e. case coding_type_no_conversion: */
4029 label_no_conversion:
4030 if (dst_bytes && src_bytes > dst_bytes)
4031 {
4032 coding->produced = dst_bytes;
4033 result = CODING_FINISH_INSUFFICIENT_DST;
4034 }
4035 else
4036 {
4037 coding->produced = src_bytes;
4038 result = CODING_FINISH_NORMAL;
4039 }
4040 if (dst_bytes)
4041 bcopy (source, destination, coding->produced);
4042 else
4043 safe_bcopy (source, destination, coding->produced);
4044 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
4045 {
4046 unsigned char *p = destination, *pend = p + coding->produced;
4047 while (p < pend)
4048 if (*p++ == '\015') p[-1] = '\n';
4049 }
4050 coding->fake_multibyte = 1;
4051 coding->consumed
4052 = coding->consumed_char = coding->produced_char = coding->produced;
4053 break;
4054 }
4055
4056 return result;
4057 }
4058
4059 /* Scan text in the region between *BEG and *END (byte positions),
4060 skip characters which we don't have to decode by coding system
4061 CODING at the head and tail, then set *BEG and *END to the region
4062 of the text we actually have to convert. The caller should move
4063 the gap out of the region in advance.
4064
4065 If STR is not NULL, *BEG and *END are indices into STR. */
4066
4067 static void
4068 shrink_decoding_region (beg, end, coding, str)
4069 int *beg, *end;
4070 struct coding_system *coding;
4071 unsigned char *str;
4072 {
4073 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4074 int eol_conversion;
4075 Lisp_Object translation_table;
4076
4077 if (coding->type == coding_type_ccl
4078 || coding->type == coding_type_undecided
4079 || !NILP (coding->post_read_conversion))
4080 {
4081 /* We can't skip any data. */
4082 return;
4083 }
4084 else if (coding->type == coding_type_no_conversion)
4085 {
4086 /* We need no conversion, but don't have to skip any data here.
4087 Decoding routine handles them effectively anyway. */
4088 return;
4089 }
4090
4091 translation_table = coding->translation_table_for_decode;
4092 if (NILP (translation_table) && !NILP (Venable_character_translation))
4093 translation_table = Vstandard_translation_table_for_decode;
4094 if (CHAR_TABLE_P (translation_table))
4095 {
4096 int i;
4097 for (i = 0; i < 128; i++)
4098 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4099 break;
4100 if (i < 128)
4101 /* Some ASCII character should be tranlsated. We give up
4102 shrinking. */
4103 return;
4104 }
4105
4106 eol_conversion = (coding->eol_type != CODING_EOL_LF);
4107
4108 if ((! eol_conversion) && (coding->heading_ascii >= 0))
4109 /* Detection routine has already found how much we can skip at the
4110 head. */
4111 *beg += coding->heading_ascii;
4112
4113 if (str)
4114 {
4115 begp_orig = begp = str + *beg;
4116 endp_orig = endp = str + *end;
4117 }
4118 else
4119 {
4120 begp_orig = begp = BYTE_POS_ADDR (*beg);
4121 endp_orig = endp = begp + *end - *beg;
4122 }
4123
4124 switch (coding->type)
4125 {
4126 case coding_type_emacs_mule:
4127 case coding_type_raw_text:
4128 if (eol_conversion)
4129 {
4130 if (coding->heading_ascii < 0)
4131 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
4132 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
4133 endp--;
4134 /* Do not consider LF as ascii if preceded by CR, since that
4135 confuses eol decoding. */
4136 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4137 endp++;
4138 }
4139 else
4140 begp = endp;
4141 break;
4142
4143 case coding_type_sjis:
4144 case coding_type_big5:
4145 /* We can skip all ASCII characters at the head. */
4146 if (coding->heading_ascii < 0)
4147 {
4148 if (eol_conversion)
4149 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4150 else
4151 while (begp < endp && *begp < 0x80) begp++;
4152 }
4153 /* We can skip all ASCII characters at the tail except for the
4154 second byte of SJIS or BIG5 code. */
4155 if (eol_conversion)
4156 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4157 else
4158 while (begp < endp && endp[-1] < 0x80) endp--;
4159 /* Do not consider LF as ascii if preceded by CR, since that
4160 confuses eol decoding. */
4161 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4162 endp++;
4163 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4164 endp++;
4165 break;
4166
4167 default: /* i.e. case coding_type_iso2022: */
4168 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4169 /* We can't skip any data. */
4170 break;
4171 if (coding->heading_ascii < 0)
4172 {
4173 /* We can skip all ASCII characters at the head except for a
4174 few control codes. */
4175 while (begp < endp && (c = *begp) < 0x80
4176 && c != ISO_CODE_CR && c != ISO_CODE_SO
4177 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4178 && (!eol_conversion || c != ISO_CODE_LF))
4179 begp++;
4180 }
4181 switch (coding->category_idx)
4182 {
4183 case CODING_CATEGORY_IDX_ISO_8_1:
4184 case CODING_CATEGORY_IDX_ISO_8_2:
4185 /* We can skip all ASCII characters at the tail. */
4186 if (eol_conversion)
4187 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4188 else
4189 while (begp < endp && endp[-1] < 0x80) endp--;
4190 /* Do not consider LF as ascii if preceded by CR, since that
4191 confuses eol decoding. */
4192 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4193 endp++;
4194 break;
4195
4196 case CODING_CATEGORY_IDX_ISO_7:
4197 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4198 {
4199 /* We can skip all charactes at the tail except for 8-bit
4200 codes and ESC and the following 2-byte at the tail. */
4201 unsigned char *eight_bit = NULL;
4202
4203 if (eol_conversion)
4204 while (begp < endp
4205 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4206 {
4207 if (!eight_bit && c & 0x80) eight_bit = endp;
4208 endp--;
4209 }
4210 else
4211 while (begp < endp
4212 && (c = endp[-1]) != ISO_CODE_ESC)
4213 {
4214 if (!eight_bit && c & 0x80) eight_bit = endp;
4215 endp--;
4216 }
4217 /* Do not consider LF as ascii if preceded by CR, since that
4218 confuses eol decoding. */
4219 if (begp < endp && endp < endp_orig
4220 && endp[-1] == '\r' && endp[0] == '\n')
4221 endp++;
4222 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4223 {
4224 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4225 /* This is an ASCII designation sequence. We can
4226 surely skip the tail. But, if we have
4227 encountered an 8-bit code, skip only the codes
4228 after that. */
4229 endp = eight_bit ? eight_bit : endp + 2;
4230 else
4231 /* Hmmm, we can't skip the tail. */
4232 endp = endp_orig;
4233 }
4234 else if (eight_bit)
4235 endp = eight_bit;
4236 }
4237 }
4238 }
4239 *beg += begp - begp_orig;
4240 *end += endp - endp_orig;
4241 return;
4242 }
4243
4244 /* Like shrink_decoding_region but for encoding. */
4245
4246 static void
4247 shrink_encoding_region (beg, end, coding, str)
4248 int *beg, *end;
4249 struct coding_system *coding;
4250 unsigned char *str;
4251 {
4252 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4253 int eol_conversion;
4254 Lisp_Object translation_table;
4255
4256 if (coding->type == coding_type_ccl)
4257 /* We can't skip any data. */
4258 return;
4259 else if (coding->type == coding_type_no_conversion)
4260 {
4261 /* We need no conversion. */
4262 *beg = *end;
4263 return;
4264 }
4265
4266 translation_table = coding->translation_table_for_encode;
4267 if (NILP (translation_table) && !NILP (Venable_character_translation))
4268 translation_table = Vstandard_translation_table_for_encode;
4269 if (CHAR_TABLE_P (translation_table))
4270 {
4271 int i;
4272 for (i = 0; i < 128; i++)
4273 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4274 break;
4275 if (i < 128)
4276 /* Some ASCII character should be tranlsated. We give up
4277 shrinking. */
4278 return;
4279 }
4280
4281 if (str)
4282 {
4283 begp_orig = begp = str + *beg;
4284 endp_orig = endp = str + *end;
4285 }
4286 else
4287 {
4288 begp_orig = begp = BYTE_POS_ADDR (*beg);
4289 endp_orig = endp = begp + *end - *beg;
4290 }
4291
4292 eol_conversion = (coding->eol_type == CODING_EOL_CR
4293 || coding->eol_type == CODING_EOL_CRLF);
4294
4295 /* Here, we don't have to check coding->pre_write_conversion because
4296 the caller is expected to have handled it already. */
4297 switch (coding->type)
4298 {
4299 case coding_type_undecided:
4300 case coding_type_emacs_mule:
4301 case coding_type_raw_text:
4302 if (eol_conversion)
4303 {
4304 while (begp < endp && *begp != '\n') begp++;
4305 while (begp < endp && endp[-1] != '\n') endp--;
4306 }
4307 else
4308 begp = endp;
4309 break;
4310
4311 case coding_type_iso2022:
4312 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4313 /* We can't skip any data. */
4314 break;
4315 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4316 {
4317 unsigned char *bol = begp;
4318 while (begp < endp && *begp < 0x80)
4319 {
4320 begp++;
4321 if (begp[-1] == '\n')
4322 bol = begp;
4323 }
4324 begp = bol;
4325 goto label_skip_tail;
4326 }
4327 /* fall down ... */
4328
4329 default:
4330 /* We can skip all ASCII characters at the head and tail. */
4331 if (eol_conversion)
4332 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4333 else
4334 while (begp < endp && *begp < 0x80) begp++;
4335 label_skip_tail:
4336 if (eol_conversion)
4337 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4338 else
4339 while (begp < endp && *(endp - 1) < 0x80) endp--;
4340 break;
4341 }
4342
4343 *beg += begp - begp_orig;
4344 *end += endp - endp_orig;
4345 return;
4346 }
4347
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only when the region is longer than this many
   bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region *BEG .. *END with shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region (otherwise), provided
   the region is longer than shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
      (*((encodep)                                                      \
         ? shrink_encoding_region                                       \
         : shrink_decoding_region)) (beg, end, coding, str);            \
  } while (0)
4361
4362 static Lisp_Object
4363 code_convert_region_unwind (dummy)
4364 Lisp_Object dummy;
4365 {
4366 inhibit_pre_post_conversion = 0;
4367 return Qnil;
4368 }
4369
4370 /* Store information about all compositions in the range FROM and TO
4371 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4372 buffer or a string, defaults to the current buffer. */
4373
4374 void
4375 coding_save_composition (coding, from, to, obj)
4376 struct coding_system *coding;
4377 int from, to;
4378 Lisp_Object obj;
4379 {
4380 Lisp_Object prop;
4381 int start, end;
4382
4383 coding->composing = COMPOSITION_DISABLED;
4384 if (!find_composition (from, to, &start, &end, &prop, obj)
4385 || end > to)
4386 return;
4387 if (start < from
4388 && (!find_composition (end, to, &start, &end, &prop, obj)
4389 || end > to))
4390 return;
4391 coding->composing = COMPOSITION_NO;
4392 coding_allocate_composition_data (coding, from);
4393 do
4394 {
4395 if (COMPOSITION_VALID_P (start, end, prop))
4396 {
4397 enum composition_method method = COMPOSITION_METHOD (prop);
4398 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4399 >= COMPOSITION_DATA_SIZE)
4400 coding_allocate_composition_data (coding, from);
4401 /* For relative composition, we remember start and end
4402 positions, for the other compositions, we also remember
4403 components. */
4404 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4405 if (method != COMPOSITION_RELATIVE)
4406 {
4407 /* We must store a*/
4408 Lisp_Object val, ch;
4409
4410 val = COMPOSITION_COMPONENTS (prop);
4411 if (CONSP (val))
4412 while (CONSP (val))
4413 {
4414 ch = XCAR (val), val = XCDR (val);
4415 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4416 }
4417 else if (VECTORP (val) || STRINGP (val))
4418 {
4419 int len = (VECTORP (val)
4420 ? XVECTOR (val)->size : XSTRING (val)->size);
4421 int i;
4422 for (i = 0; i < len; i++)
4423 {
4424 ch = (STRINGP (val)
4425 ? Faref (val, make_number (i))
4426 : XVECTOR (val)->contents[i]);
4427 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4428 }
4429 }
4430 else /* INTEGERP (val) */
4431 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4432 }
4433 CODING_ADD_COMPOSITION_END (coding, end - from);
4434 }
4435 start = end;
4436 }
4437 while (start < to
4438 && find_composition (start, to, &start, &end, &prop, obj)
4439 && end <= to);
4440
4441 /* Make coding->cmp_data point to the first memory block. */
4442 while (coding->cmp_data->prev)
4443 coding->cmp_data = coding->cmp_data->prev;
4444 coding->cmp_data_start = 0;
4445 }
4446
4447 /* Reflect the saved information about compositions to OBJ.
4448 CODING->cmp_data points to a memory block for the informaiton. OBJ
4449 is a buffer or a string, defaults to the current buffer. */
4450
4451 static void
4452 coding_restore_composition (coding, obj)
4453 struct coding_system *coding;
4454 Lisp_Object obj;
4455 {
4456 struct composition_data *cmp_data = coding->cmp_data;
4457
4458 if (!cmp_data)
4459 return;
4460
4461 while (cmp_data->prev)
4462 cmp_data = cmp_data->prev;
4463
4464 while (cmp_data)
4465 {
4466 int i;
4467
4468 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4469 {
4470 int *data = cmp_data->data + i;
4471 enum composition_method method = (enum composition_method) data[3];
4472 Lisp_Object components;
4473
4474 if (method == COMPOSITION_RELATIVE)
4475 components = Qnil;
4476 else
4477 {
4478 int len = data[0] - 4, j;
4479 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4480
4481 for (j = 0; j < len; j++)
4482 args[j] = make_number (data[4 + j]);
4483 components = (method == COMPOSITION_WITH_ALTCHARS
4484 ? Fstring (len, args) : Fvector (len, args));
4485 }
4486 compose_text (data[1], data[2], components, Qnil, obj);
4487 }
4488 cmp_data = cmp_data->next;
4489 }
4490 }
4491
4492 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4493 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4494 coding system CODING, and return the status code of code conversion
4495 (currently, this value has no meaning).
4496
4497 How many characters (and bytes) are converted to how many
4498 characters (and bytes) are recorded in members of the structure
4499 CODING.
4500
4501 If REPLACE is nonzero, we do various things as if the original text
4502 is deleted and a new text is inserted. See the comments in
4503 replace_range (insdel.c) to know what we are doing. */
4504
4505 int
4506 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4507 int from, from_byte, to, to_byte, encodep, replace;
4508 struct coding_system *coding;
4509 {
4510 int len = to - from, len_byte = to_byte - from_byte;
4511 int require, inserted, inserted_byte;
4512 int head_skip, tail_skip, total_skip;
4513 Lisp_Object saved_coding_symbol;
4514 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4515 int first = 1;
4516 int fake_multibyte = 0;
4517 unsigned char *src, *dst;
4518 Lisp_Object deletion;
4519 int orig_point = PT, orig_len = len;
4520 int prev_Z;
4521
4522 deletion = Qnil;
4523 saved_coding_symbol = Qnil;
4524
4525 if (from < PT && PT < to)
4526 {
4527 TEMP_SET_PT_BOTH (from, from_byte);
4528 orig_point = from;
4529 }
4530
4531 if (replace)
4532 {
4533 int saved_from = from;
4534
4535 prepare_to_modify_buffer (from, to, &from);
4536 if (saved_from != from)
4537 {
4538 to = from + len;
4539 if (multibyte)
4540 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4541 else
4542 from_byte = from, to_byte = to;
4543 len_byte = to_byte - from_byte;
4544 }
4545 }
4546
4547 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4548 {
4549 /* We must detect encoding of text and eol format. */
4550
4551 if (from < GPT && to > GPT)
4552 move_gap_both (from, from_byte);
4553 if (coding->type == coding_type_undecided)
4554 {
4555 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4556 if (coding->type == coding_type_undecided)
4557 /* It seems that the text contains only ASCII, but we
4558 should not left it undecided because the deeper
4559 decoding routine (decode_coding) tries to detect the
4560 encodings again in vain. */
4561 coding->type = coding_type_emacs_mule;
4562 }
4563 if (coding->eol_type == CODING_EOL_UNDECIDED)
4564 {
4565 saved_coding_symbol = coding->symbol;
4566 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4567 if (coding->eol_type == CODING_EOL_UNDECIDED)
4568 coding->eol_type = CODING_EOL_LF;
4569 /* We had better recover the original eol format if we
4570 encounter an inconsitent eol format while decoding. */
4571 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4572 }
4573 }
4574
4575 if (encodep
4576 ? ! CODING_REQUIRE_ENCODING (coding)
4577 : ! CODING_REQUIRE_DECODING (coding))
4578 {
4579 coding->consumed_char = len;
4580 coding->consumed = len_byte;
4581 coding->produced = len_byte;
4582 if (multibyte
4583 && ! replace
4584 /* See the comment of the member heading_ascii in coding.h. */
4585 && coding->heading_ascii < len_byte)
4586 {
4587 /* We still may have to combine byte at the head and the
4588 tail of the text in the region. */
4589 if (from < GPT && GPT < to)
4590 move_gap_both (to, to_byte);
4591 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4592 adjust_after_insert (from, from_byte, to, to_byte, len);
4593 coding->produced_char = len;
4594 }
4595 else
4596 {
4597 if (!replace)
4598 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4599 coding->produced_char = len_byte;
4600 }
4601 return 0;
4602 }
4603
4604 /* Now we convert the text. */
4605
4606 /* For encoding, we must process pre-write-conversion in advance. */
4607 if (encodep
4608 && ! NILP (coding->pre_write_conversion)
4609 && SYMBOLP (coding->pre_write_conversion)
4610 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4611 {
4612 /* The function in pre-write-conversion may put a new text in a
4613 new buffer. */
4614 struct buffer *prev = current_buffer;
4615 Lisp_Object new;
4616 int count = specpdl_ptr - specpdl;
4617
4618 record_unwind_protect (code_convert_region_unwind, Qnil);
4619 /* We should not call any more pre-write/post-read-conversion
4620 functions while this pre-write-conversion is running. */
4621 inhibit_pre_post_conversion = 1;
4622 call2 (coding->pre_write_conversion,
4623 make_number (from), make_number (to));
4624 inhibit_pre_post_conversion = 0;
4625 /* Discard the unwind protect. */
4626 specpdl_ptr--;
4627
4628 if (current_buffer != prev)
4629 {
4630 len = ZV - BEGV;
4631 new = Fcurrent_buffer ();
4632 set_buffer_internal_1 (prev);
4633 del_range_2 (from, from_byte, to, to_byte, 0);
4634 TEMP_SET_PT_BOTH (from, from_byte);
4635 insert_from_buffer (XBUFFER (new), 1, len, 0);
4636 Fkill_buffer (new);
4637 if (orig_point >= to)
4638 orig_point += len - orig_len;
4639 else if (orig_point > from)
4640 orig_point = from;
4641 orig_len = len;
4642 to = from + len;
4643 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4644 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4645 len_byte = to_byte - from_byte;
4646 TEMP_SET_PT_BOTH (from, from_byte);
4647 }
4648 }
4649
4650 if (replace)
4651 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4652
4653 if (coding->composing != COMPOSITION_DISABLED)
4654 {
4655 if (encodep)
4656 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4657 else
4658 coding_allocate_composition_data (coding, from);
4659 }
4660
4661 /* For conversion by CCL program and for encoding with composition
4662 handling, we can't skip any character because we may convert or
4663 compose even ASCII characters. */
4664 if (coding->type != coding_type_ccl
4665 && (!encodep || coding->cmp_data == NULL))
4666 {
4667 /* Try to skip the heading and tailing ASCIIs. */
4668 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4669
4670 if (from < GPT && GPT < to)
4671 move_gap_both (from, from_byte);
4672 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4673 if (from_byte == to_byte
4674 && (encodep || NILP (coding->post_read_conversion))
4675 && ! CODING_REQUIRE_FLUSHING (coding))
4676 {
4677 coding->produced = len_byte;
4678 coding->produced_char = multibyte ? len : len_byte;
4679 if (!replace)
4680 /* We must record and adjust for this new text now. */
4681 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4682 return 0;
4683 }
4684
4685 head_skip = from_byte - from_byte_orig;
4686 tail_skip = to_byte_orig - to_byte;
4687 total_skip = head_skip + tail_skip;
4688 from += head_skip;
4689 to -= tail_skip;
4690 len -= total_skip; len_byte -= total_skip;
4691
4692 if (coding->cmp_data)
4693 coding->cmp_data->char_offset = from;
4694 }
4695
4696 /* The code conversion routine can not preserve text properties for
4697 now. So, we must remove all text properties in the region.
4698 Here, we must suppress all modification hooks. */
4699 if (replace)
4700 {
4701 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4702 inhibit_modification_hooks = 1;
4703 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4704 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4705 }
4706
4707 /* For converion, we must put the gap before the text in addition to
4708 making the gap larger for efficient decoding. The required gap
4709 size starts from 2000 which is the magic number used in make_gap.
4710 But, after one batch of conversion, it will be incremented if we
4711 find that it is not enough . */
4712 require = 2000;
4713
4714 if (GAP_SIZE < require)
4715 make_gap (require - GAP_SIZE);
4716 move_gap_both (from, from_byte);
4717
4718 inserted = inserted_byte = 0;
4719
4720 GAP_SIZE += len_byte;
4721 ZV -= len;
4722 Z -= len;
4723 ZV_BYTE -= len_byte;
4724 Z_BYTE -= len_byte;
4725
4726 if (GPT - BEG < BEG_UNCHANGED)
4727 BEG_UNCHANGED = GPT - BEG;
4728 if (Z - GPT < END_UNCHANGED)
4729 END_UNCHANGED = Z - GPT;
4730
4731 for (;;)
4732 {
4733 int result;
4734
4735 /* The buffer memory is now:
4736 +--------+converted-text+---------+-------original-text------+---+
4737 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4738 |<------------------- GAP_SIZE -------------------->| */
4739 src = GAP_END_ADDR - len_byte;
4740 dst = GPT_ADDR + inserted_byte;
4741
4742 if (encodep)
4743 result = encode_coding (coding, src, dst, len_byte, 0);
4744 else
4745 result = decode_coding (coding, src, dst, len_byte, 0);
4746
4747 /* The buffer memory is now:
4748 +--------+-------converted-text--------+--+---original-text--+---+
4749 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4750 |<------------------- GAP_SIZE -------------------->| */
4751
4752 if (coding->fake_multibyte)
4753 fake_multibyte = 1;
4754
4755 if (!encodep && !multibyte)
4756 coding->produced_char = coding->produced;
4757 inserted += coding->produced_char;
4758 inserted_byte += coding->produced;
4759 len_byte -= coding->consumed;
4760
4761 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4762 {
4763 coding_allocate_composition_data (coding, from + inserted);
4764 continue;
4765 }
4766
4767 src += coding->consumed;
4768 dst += coding->produced;
4769
4770 if (result == CODING_FINISH_NORMAL)
4771 {
4772 src += len_byte;
4773 break;
4774 }
4775 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4776 {
4777 unsigned char *pend = dst, *p = pend - inserted_byte;
4778 Lisp_Object eol_type;
4779
4780 /* Encode LFs back to the original eol format (CR or CRLF). */
4781 if (coding->eol_type == CODING_EOL_CR)
4782 {
4783 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4784 }
4785 else
4786 {
4787 int count = 0;
4788
4789 while (p < pend) if (*p++ == '\n') count++;
4790 if (src - dst < count)
4791 {
4792 /* We don't have sufficient room for encoding LFs
4793 back to CRLF. We must record converted and
4794 not-yet-converted text back to the buffer
4795 content, enlarge the gap, then record them out of
4796 the buffer contents again. */
4797 int add = len_byte + inserted_byte;
4798
4799 GAP_SIZE -= add;
4800 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4801 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4802 make_gap (count - GAP_SIZE);
4803 GAP_SIZE += add;
4804 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4805 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4806 /* Don't forget to update SRC, DST, and PEND. */
4807 src = GAP_END_ADDR - len_byte;
4808 dst = GPT_ADDR + inserted_byte;
4809 pend = dst;
4810 }
4811 inserted += count;
4812 inserted_byte += count;
4813 coding->produced += count;
4814 p = dst = pend + count;
4815 while (count)
4816 {
4817 *--p = *--pend;
4818 if (*p == '\n') count--, *--p = '\r';
4819 }
4820 }
4821
4822 /* Suppress eol-format conversion in the further conversion. */
4823 coding->eol_type = CODING_EOL_LF;
4824
4825 /* Set the coding system symbol to that for Unix-like EOL. */
4826 eol_type = Fget (saved_coding_symbol, Qeol_type);
4827 if (VECTORP (eol_type)
4828 && XVECTOR (eol_type)->size == 3
4829 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4830 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4831 else
4832 coding->symbol = saved_coding_symbol;
4833
4834 continue;
4835 }
4836 if (len_byte <= 0)
4837 {
4838 if (coding->type != coding_type_ccl
4839 || coding->mode & CODING_MODE_LAST_BLOCK)
4840 break;
4841 coding->mode |= CODING_MODE_LAST_BLOCK;
4842 continue;
4843 }
4844 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4845 {
4846 /* The source text ends in invalid codes. Let's just
4847 make them valid buffer contents, and finish conversion. */
4848 inserted += len_byte;
4849 inserted_byte += len_byte;
4850 while (len_byte--)
4851 *dst++ = *src++;
4852 fake_multibyte = 1;
4853 break;
4854 }
4855 if (result == CODING_FINISH_INTERRUPT)
4856 {
4857 /* The conversion procedure was interrupted by a user. */
4858 fake_multibyte = 1;
4859 break;
4860 }
4861 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4862 if (coding->consumed < 1)
4863 {
4864 /* It's quite strange to require more memory without
4865 consuming any bytes. Perhaps CCL program bug. */
4866 fake_multibyte = 1;
4867 break;
4868 }
4869 if (first)
4870 {
4871 /* We have just done the first batch of conversion which was
4872 stoped because of insufficient gap. Let's reconsider the
4873 required gap size (i.e. SRT - DST) now.
4874
4875 We have converted ORIG bytes (== coding->consumed) into
4876 NEW bytes (coding->produced). To convert the remaining
4877 LEN bytes, we may need REQUIRE bytes of gap, where:
4878 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4879 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4880 Here, we are sure that NEW >= ORIG. */
4881 float ratio = coding->produced - coding->consumed;
4882 ratio /= coding->consumed;
4883 require = len_byte * ratio;
4884 first = 0;
4885 }
4886 if ((src - dst) < (require + 2000))
4887 {
4888 /* See the comment above the previous call of make_gap. */
4889 int add = len_byte + inserted_byte;
4890
4891 GAP_SIZE -= add;
4892 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4893 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4894 make_gap (require + 2000);
4895 GAP_SIZE += add;
4896 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4897 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4898 }
4899 }
4900 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4901
4902 if (multibyte
4903 && (encodep
4904 || fake_multibyte
4905 || (to - from) != (to_byte - from_byte)))
4906 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4907
4908 /* If we have shrinked the conversion area, adjust it now. */
4909 if (total_skip > 0)
4910 {
4911 if (tail_skip > 0)
4912 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4913 inserted += total_skip; inserted_byte += total_skip;
4914 GAP_SIZE += total_skip;
4915 GPT -= head_skip; GPT_BYTE -= head_skip;
4916 ZV -= total_skip; ZV_BYTE -= total_skip;
4917 Z -= total_skip; Z_BYTE -= total_skip;
4918 from -= head_skip; from_byte -= head_skip;
4919 to += tail_skip; to_byte += tail_skip;
4920 }
4921
4922 prev_Z = Z;
4923 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4924 inserted = Z - prev_Z;
4925
4926 if (!encodep && coding->cmp_data && coding->cmp_data->used)
4927 coding_restore_composition (coding, Fcurrent_buffer ());
4928 coding_free_composition_data (coding);
4929
4930 if (! encodep && ! NILP (coding->post_read_conversion))
4931 {
4932 Lisp_Object val;
4933 int count = specpdl_ptr - specpdl;
4934
4935 if (from != PT)
4936 TEMP_SET_PT_BOTH (from, from_byte);
4937 prev_Z = Z;
4938 record_unwind_protect (code_convert_region_unwind, Qnil);
4939 /* We should not call any more pre-write/post-read-conversion
4940 functions while this post-read-conversion is running. */
4941 inhibit_pre_post_conversion = 1;
4942 val = call1 (coding->post_read_conversion, make_number (inserted));
4943 inhibit_pre_post_conversion = 0;
4944 /* Discard the unwind protect. */
4945 specpdl_ptr--;
4946 CHECK_NUMBER (val, 0);
4947 inserted += Z - prev_Z;
4948 }
4949
4950 if (orig_point >= from)
4951 {
4952 if (orig_point >= from + orig_len)
4953 orig_point += inserted - orig_len;
4954 else
4955 orig_point = from;
4956 TEMP_SET_PT (orig_point);
4957 }
4958
4959 if (replace)
4960 {
4961 signal_after_change (from, to - from, inserted);
4962 update_compositions (from, from + inserted, CHECK_BORDER);
4963 }
4964
4965 {
4966 coding->consumed = to_byte - from_byte;
4967 coding->consumed_char = to - from;
4968 coding->produced = inserted_byte;
4969 coding->produced_char = inserted;
4970 }
4971
4972 return 0;
4973 }
4974
4975 Lisp_Object
4976 code_convert_string (str, coding, encodep, nocopy)
4977 Lisp_Object str;
4978 struct coding_system *coding;
4979 int encodep, nocopy;
4980 {
4981 int len;
4982 char *buf;
4983 int from = 0, to = XSTRING (str)->size;
4984 int to_byte = STRING_BYTES (XSTRING (str));
4985 struct gcpro gcpro1;
4986 Lisp_Object saved_coding_symbol;
4987 int result;
4988
4989 saved_coding_symbol = Qnil;
4990 if ((encodep && !NILP (coding->pre_write_conversion)
4991 || !encodep && !NILP (coding->post_read_conversion)))
4992 {
4993 /* Since we have to call Lisp functions which assume target text
4994 is in a buffer, after setting a temporary buffer, call
4995 code_convert_region. */
4996 int count = specpdl_ptr - specpdl;
4997 struct buffer *prev = current_buffer;
4998 int multibyte = STRING_MULTIBYTE (str);
4999
5000 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5001 record_unwind_protect (code_convert_region_unwind, Qnil);
5002 inhibit_pre_post_conversion = 1;
5003 GCPRO1 (str);
5004 temp_output_buffer_setup (" *code-converting-work*");
5005 set_buffer_internal (XBUFFER (Vstandard_output));
5006 /* We must insert the contents of STR as is without
5007 unibyte<->multibyte conversion. For that, we adjust the
5008 multibyteness of the working buffer to that of STR. */
5009 Ferase_buffer (); /* for safety */
5010 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5011 insert_from_string (str, 0, 0, to, to_byte, 0);
5012 UNGCPRO;
5013 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
5014 /* Make a unibyte string if we are encoding, otherwise make a
5015 multibyte string. */
5016 Fset_buffer_multibyte (encodep ? Qnil : Qt);
5017 str = make_buffer_string (BEGV, ZV, 0);
5018 return unbind_to (count, str);
5019 }
5020
5021 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5022 {
5023 /* See the comments in code_convert_region. */
5024 if (coding->type == coding_type_undecided)
5025 {
5026 detect_coding (coding, XSTRING (str)->data, to_byte);
5027 if (coding->type == coding_type_undecided)
5028 coding->type = coding_type_emacs_mule;
5029 }
5030 if (coding->eol_type == CODING_EOL_UNDECIDED)
5031 {
5032 saved_coding_symbol = coding->symbol;
5033 detect_eol (coding, XSTRING (str)->data, to_byte);
5034 if (coding->eol_type == CODING_EOL_UNDECIDED)
5035 coding->eol_type = CODING_EOL_LF;
5036 /* We had better recover the original eol format if we
5037 encounter an inconsitent eol format while decoding. */
5038 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5039 }
5040 }
5041
5042 if (encodep
5043 ? ! CODING_REQUIRE_ENCODING (coding)
5044 : ! CODING_REQUIRE_DECODING (coding))
5045 return (nocopy ? str : Fcopy_sequence (str));
5046
5047 if (coding->composing != COMPOSITION_DISABLED)
5048 {
5049 if (encodep)
5050 coding_save_composition (coding, from, to, str);
5051 else
5052 coding_allocate_composition_data (coding, from);
5053 }
5054
5055 /* For conversion by CCL program and for encoding with composition
5056 handling, we can't skip any character because we may convert or
5057 compose even ASCII characters. */
5058 if (coding->type != coding_type_ccl
5059 && (!encodep || coding->cmp_data == NULL))
5060 {
5061 /* Try to skip the heading and tailing ASCIIs. */
5062 int from_orig = from;
5063
5064 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5065 encodep);
5066 if (from == to_byte)
5067 return (nocopy ? str : Fcopy_sequence (str));
5068
5069 if (coding->cmp_data)
5070 coding->cmp_data->char_offset = from;
5071 }
5072
5073 if (encodep)
5074 len = encoding_buffer_size (coding, to_byte - from);
5075 else
5076 len = decoding_buffer_size (coding, to_byte - from);
5077 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5078 GCPRO1 (str);
5079 buf = get_conversion_buffer (len);
5080 UNGCPRO;
5081
5082 if (from > 0)
5083 bcopy (XSTRING (str)->data, buf, from);
5084 result = (encodep
5085 ? encode_coding (coding, XSTRING (str)->data + from,
5086 buf + from, to_byte - from, len)
5087 : decode_coding (coding, XSTRING (str)->data + from,
5088 buf + from, to_byte - from, len));
5089 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5090 {
5091 /* We simply try to decode the whole string again but without
5092 eol-conversion this time. */
5093 coding->eol_type = CODING_EOL_LF;
5094 coding->symbol = saved_coding_symbol;
5095 coding_free_composition_data (coding);
5096 return code_convert_string (str, coding, encodep, nocopy);
5097 }
5098
5099 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5100 STRING_BYTES (XSTRING (str)) - to_byte);
5101
5102 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5103 if (encodep)
5104 str = make_unibyte_string (buf, len + coding->produced);
5105 else
5106 {
5107 int chars= (coding->fake_multibyte
5108 ? multibyte_chars_in_text (buf + from, coding->produced)
5109 : coding->produced_char);
5110 str = make_multibyte_string (buf, len + chars, len + coding->produced);
5111 }
5112
5113 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5114 coding_restore_composition (coding, str);
5115
5116 coding_free_composition_data (coding);
5117 return str;
5118 }
5119
5120 \f
5121 #ifdef emacs
5122 /*** 8. Emacs Lisp library functions ***/
5123
5124 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5125 "Return t if OBJECT is nil or a coding-system.\n\
5126 See the documentation of `make-coding-system' for information\n\
5127 about coding-system objects.")
5128 (obj)
5129 Lisp_Object obj;
5130 {
5131 if (NILP (obj))
5132 return Qt;
5133 if (!SYMBOLP (obj))
5134 return Qnil;
5135 /* Get coding-spec vector for OBJ. */
5136 obj = Fget (obj, Qcoding_system);
5137 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5138 ? Qt : Qnil);
5139 }
5140
5141 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5142 Sread_non_nil_coding_system, 1, 1, 0,
5143 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5144 (prompt)
5145 Lisp_Object prompt;
5146 {
5147 Lisp_Object val;
5148 do
5149 {
5150 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5151 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5152 }
5153 while (XSTRING (val)->size == 0);
5154 return (Fintern (val, Qnil));
5155 }
5156
5157 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5158 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5159 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5160 (prompt, default_coding_system)
5161 Lisp_Object prompt, default_coding_system;
5162 {
5163 Lisp_Object val;
5164 if (SYMBOLP (default_coding_system))
5165 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5166 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5167 Qt, Qnil, Qcoding_system_history,
5168 default_coding_system, Qnil);
5169 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5170 }
5171
5172 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5173 1, 1, 0,
5174 "Check validity of CODING-SYSTEM.\n\
5175 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5176 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5177 The value of property should be a vector of length 5.")
5178 (coding_system)
5179 Lisp_Object coding_system;
5180 {
5181 CHECK_SYMBOL (coding_system, 0);
5182 if (!NILP (Fcoding_system_p (coding_system)))
5183 return coding_system;
5184 while (1)
5185 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5186 }
5187 \f
5188 Lisp_Object
5189 detect_coding_system (src, src_bytes, highest)
5190 unsigned char *src;
5191 int src_bytes, highest;
5192 {
5193 int coding_mask, eol_type;
5194 Lisp_Object val, tmp;
5195 int dummy;
5196
5197 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5198 eol_type = detect_eol_type (src, src_bytes, &dummy);
5199 if (eol_type == CODING_EOL_INCONSISTENT)
5200 eol_type = CODING_EOL_UNDECIDED;
5201
5202 if (!coding_mask)
5203 {
5204 val = Qundecided;
5205 if (eol_type != CODING_EOL_UNDECIDED)
5206 {
5207 Lisp_Object val2;
5208 val2 = Fget (Qundecided, Qeol_type);
5209 if (VECTORP (val2))
5210 val = XVECTOR (val2)->contents[eol_type];
5211 }
5212 return (highest ? val : Fcons (val, Qnil));
5213 }
5214
5215 /* At first, gather possible coding systems in VAL. */
5216 val = Qnil;
5217 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCDR (tmp))
5218 {
5219 int idx
5220 = XFASTINT (Fget (XCAR (tmp), Qcoding_category_index));
5221 if (coding_mask & (1 << idx))
5222 {
5223 val = Fcons (Fsymbol_value (XCAR (tmp)), val);
5224 if (highest)
5225 break;
5226 }
5227 }
5228 if (!highest)
5229 val = Fnreverse (val);
5230
5231 /* Then, replace the elements with subsidiary coding systems. */
5232 for (tmp = val; !NILP (tmp); tmp = XCDR (tmp))
5233 {
5234 if (eol_type != CODING_EOL_UNDECIDED
5235 && eol_type != CODING_EOL_INCONSISTENT)
5236 {
5237 Lisp_Object eol;
5238 eol = Fget (XCAR (tmp), Qeol_type);
5239 if (VECTORP (eol))
5240 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5241 }
5242 }
5243 return (highest ? XCAR (val) : val);
5244 }
5245
5246 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5247 2, 3, 0,
5248 "Detect coding system of the text in the region between START and END.\n\
5249 Return a list of possible coding systems ordered by priority.\n\
5250 \n\
5251 If only ASCII characters are found, it returns a list of single element\n\
5252 `undecided' or its subsidiary coding system according to a detected\n\
5253 end-of-line format.\n\
5254 \n\
5255 If optional argument HIGHEST is non-nil, return the coding system of\n\
5256 highest priority.")
5257 (start, end, highest)
5258 Lisp_Object start, end, highest;
5259 {
5260 int from, to;
5261 int from_byte, to_byte;
5262
5263 CHECK_NUMBER_COERCE_MARKER (start, 0);
5264 CHECK_NUMBER_COERCE_MARKER (end, 1);
5265
5266 validate_region (&start, &end);
5267 from = XINT (start), to = XINT (end);
5268 from_byte = CHAR_TO_BYTE (from);
5269 to_byte = CHAR_TO_BYTE (to);
5270
5271 if (from < GPT && to >= GPT)
5272 move_gap_both (to, to_byte);
5273
5274 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5275 to_byte - from_byte,
5276 !NILP (highest));
5277 }
5278
5279 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5280 1, 2, 0,
5281 "Detect coding system of the text in STRING.\n\
5282 Return a list of possible coding systems ordered by priority.\n\
5283 \n\
5284 If only ASCII characters are found, it returns a list of single element\n\
5285 `undecided' or its subsidiary coding system according to a detected\n\
5286 end-of-line format.\n\
5287 \n\
5288 If optional argument HIGHEST is non-nil, return the coding system of\n\
5289 highest priority.")
5290 (string, highest)
5291 Lisp_Object string, highest;
5292 {
5293 CHECK_STRING (string, 0);
5294
5295 return detect_coding_system (XSTRING (string)->data,
5296 STRING_BYTES (XSTRING (string)),
5297 !NILP (highest));
5298 }
5299
5300 Lisp_Object
5301 code_convert_region1 (start, end, coding_system, encodep)
5302 Lisp_Object start, end, coding_system;
5303 int encodep;
5304 {
5305 struct coding_system coding;
5306 int from, to, len;
5307
5308 CHECK_NUMBER_COERCE_MARKER (start, 0);
5309 CHECK_NUMBER_COERCE_MARKER (end, 1);
5310 CHECK_SYMBOL (coding_system, 2);
5311
5312 validate_region (&start, &end);
5313 from = XFASTINT (start);
5314 to = XFASTINT (end);
5315
5316 if (NILP (coding_system))
5317 return make_number (to - from);
5318
5319 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5320 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5321
5322 coding.mode |= CODING_MODE_LAST_BLOCK;
5323 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5324 &coding, encodep, 1);
5325 Vlast_coding_system_used = coding.symbol;
5326 return make_number (coding.produced_char);
5327 }
5328
5329 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5330 3, 3, "r\nzCoding system: ",
5331 "Decode the current region by specified coding system.\n\
5332 When called from a program, takes three arguments:\n\
5333 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5334 This function sets `last-coding-system-used' to the precise coding system\n\
5335 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5336 not fully specified.)\n\
5337 It returns the length of the decoded text.")
5338 (start, end, coding_system)
5339 Lisp_Object start, end, coding_system;
5340 {
5341 return code_convert_region1 (start, end, coding_system, 0);
5342 }
5343
5344 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5345 3, 3, "r\nzCoding system: ",
5346 "Encode the current region by specified coding system.\n\
5347 When called from a program, takes three arguments:\n\
5348 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5349 This function sets `last-coding-system-used' to the precise coding system\n\
5350 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5351 not fully specified.)\n\
5352 It returns the length of the encoded text.")
5353 (start, end, coding_system)
5354 Lisp_Object start, end, coding_system;
5355 {
5356 return code_convert_region1 (start, end, coding_system, 1);
5357 }
5358
5359 Lisp_Object
5360 code_convert_string1 (string, coding_system, nocopy, encodep)
5361 Lisp_Object string, coding_system, nocopy;
5362 int encodep;
5363 {
5364 struct coding_system coding;
5365
5366 CHECK_STRING (string, 0);
5367 CHECK_SYMBOL (coding_system, 1);
5368
5369 if (NILP (coding_system))
5370 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5371
5372 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5373 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5374
5375 coding.mode |= CODING_MODE_LAST_BLOCK;
5376 string = code_convert_string (string, &coding, encodep, !NILP (nocopy));
5377 Vlast_coding_system_used = coding.symbol;
5378
5379 return string;
5380 }
5381
5382 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5383 2, 3, 0,
5384 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5385 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5386 if the decoding operation is trivial.\n\
5387 This function sets `last-coding-system-used' to the precise coding system\n\
5388 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5389 not fully specified.)")
5390 (string, coding_system, nocopy)
5391 Lisp_Object string, coding_system, nocopy;
5392 {
5393 return code_convert_string1 (string, coding_system, nocopy, 0);
5394 }
5395
5396 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5397 2, 3, 0,
5398 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5399 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5400 if the encoding operation is trivial.\n\
5401 This function sets `last-coding-system-used' to the precise coding system\n\
5402 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5403 not fully specified.)")
5404 (string, coding_system, nocopy)
5405 Lisp_Object string, coding_system, nocopy;
5406 {
5407 return code_convert_string1 (string, coding_system, nocopy, 1);
5408 }
5409
5410 /* Encode or decode STRING according to CODING_SYSTEM.
5411 Do not set Vlast_coding_system_used.
5412
5413 This function is called only from macros DECODE_FILE and
5414 ENCODE_FILE, thus we ignore character composition. */
5415
5416 Lisp_Object
5417 code_convert_string_norecord (string, coding_system, encodep)
5418 Lisp_Object string, coding_system;
5419 int encodep;
5420 {
5421 struct coding_system coding;
5422
5423 CHECK_STRING (string, 0);
5424 CHECK_SYMBOL (coding_system, 1);
5425
5426 if (NILP (coding_system))
5427 return string;
5428
5429 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5430 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5431
5432 coding.composing = COMPOSITION_DISABLED;
5433 coding.mode |= CODING_MODE_LAST_BLOCK;
5434 return code_convert_string (string, &coding, encodep, Qt);
5435 }
5436 \f
5437 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5438 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5439 Return the corresponding character.")
5440 (code)
5441 Lisp_Object code;
5442 {
5443 unsigned char c1, c2, s1, s2;
5444 Lisp_Object val;
5445
5446 CHECK_NUMBER (code, 0);
5447 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5448 if (s1 == 0)
5449 {
5450 if (s2 < 0x80)
5451 XSETFASTINT (val, s2);
5452 else if (s2 >= 0xA0 || s2 <= 0xDF)
5453 XSETFASTINT (val,
5454 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5455 else
5456 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5457 }
5458 else
5459 {
5460 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5461 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5462 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5463 DECODE_SJIS (s1, s2, c1, c2);
5464 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5465 }
5466 return val;
5467 }
5468
5469 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5470 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5471 Return the corresponding code in SJIS.")
5472 (ch)
5473 Lisp_Object ch;
5474 {
5475 int charset, c1, c2, s1, s2;
5476 Lisp_Object val;
5477
5478 CHECK_NUMBER (ch, 0);
5479 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5480 if (charset == CHARSET_ASCII)
5481 {
5482 val = ch;
5483 }
5484 else if (charset == charset_jisx0208
5485 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5486 {
5487 ENCODE_SJIS (c1, c2, s1, s2);
5488 XSETFASTINT (val, (s1 << 8) | s2);
5489 }
5490 else if (charset == charset_katakana_jisx0201
5491 && c1 > 0x20 && c2 < 0xE0)
5492 {
5493 XSETFASTINT (val, c1 | 0x80);
5494 }
5495 else
5496 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5497 return val;
5498 }
5499
5500 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5501 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5502 Return the corresponding character.")
5503 (code)
5504 Lisp_Object code;
5505 {
5506 int charset;
5507 unsigned char b1, b2, c1, c2;
5508 Lisp_Object val;
5509
5510 CHECK_NUMBER (code, 0);
5511 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5512 if (b1 == 0)
5513 {
5514 if (b2 >= 0x80)
5515 error ("Invalid BIG5 code: %x", XFASTINT (code));
5516 val = code;
5517 }
5518 else
5519 {
5520 if ((b1 < 0xA1 || b1 > 0xFE)
5521 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5522 error ("Invalid BIG5 code: %x", XFASTINT (code));
5523 DECODE_BIG5 (b1, b2, charset, c1, c2);
5524 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5525 }
5526 return val;
5527 }
5528
5529 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5530 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5531 Return the corresponding character code in Big5.")
5532 (ch)
5533 Lisp_Object ch;
5534 {
5535 int charset, c1, c2, b1, b2;
5536 Lisp_Object val;
5537
5538 CHECK_NUMBER (ch, 0);
5539 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5540 if (charset == CHARSET_ASCII)
5541 {
5542 val = ch;
5543 }
5544 else if ((charset == charset_big5_1
5545 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5546 || (charset == charset_big5_2
5547 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5548 {
5549 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5550 XSETFASTINT (val, (b1 << 8) | b2);
5551 }
5552 else
5553 error ("Can't encode to Big5: %d", XFASTINT (ch));
5554 return val;
5555 }
5556 \f
5557 DEFUN ("set-terminal-coding-system-internal",
5558 Fset_terminal_coding_system_internal,
5559 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5560 (coding_system)
5561 Lisp_Object coding_system;
5562 {
5563 CHECK_SYMBOL (coding_system, 0);
5564 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5565 /* We had better not send unsafe characters to terminal. */
5566 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5567 /* Characer composition should be disabled. */
5568 terminal_coding.composing = COMPOSITION_DISABLED;
5569 return Qnil;
5570 }
5571
5572 DEFUN ("set-safe-terminal-coding-system-internal",
5573 Fset_safe_terminal_coding_system_internal,
5574 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5575 (coding_system)
5576 Lisp_Object coding_system;
5577 {
5578 CHECK_SYMBOL (coding_system, 0);
5579 setup_coding_system (Fcheck_coding_system (coding_system),
5580 &safe_terminal_coding);
5581 /* Characer composition should be disabled. */
5582 safe_terminal_coding.composing = COMPOSITION_DISABLED;
5583 return Qnil;
5584 }
5585
5586 DEFUN ("terminal-coding-system",
5587 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5588 "Return coding system specified for terminal output.")
5589 ()
5590 {
5591 return terminal_coding.symbol;
5592 }
5593
5594 DEFUN ("set-keyboard-coding-system-internal",
5595 Fset_keyboard_coding_system_internal,
5596 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5597 (coding_system)
5598 Lisp_Object coding_system;
5599 {
5600 CHECK_SYMBOL (coding_system, 0);
5601 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5602 /* Characer composition should be disabled. */
5603 keyboard_coding.composing = COMPOSITION_DISABLED;
5604 return Qnil;
5605 }
5606
5607 DEFUN ("keyboard-coding-system",
5608 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5609 "Return coding system specified for decoding keyboard input.")
5610 ()
5611 {
5612 return keyboard_coding.symbol;
5613 }
5614
5615 \f
5616 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5617 Sfind_operation_coding_system, 1, MANY, 0,
5618 "Choose a coding system for an operation based on the target name.\n\
5619 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5620 DECODING-SYSTEM is the coding system to use for decoding\n\
5621 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5622 for encoding (in case OPERATION does encoding).\n\
5623 \n\
5624 The first argument OPERATION specifies an I/O primitive:\n\
5625 For file I/O, `insert-file-contents' or `write-region'.\n\
5626 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5627 For network I/O, `open-network-stream'.\n\
5628 \n\
5629 The remaining arguments should be the same arguments that were passed\n\
5630 to the primitive. Depending on which primitive, one of those arguments\n\
5631 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5632 whichever argument specifies the file name is TARGET.\n\
5633 \n\
5634 TARGET has a meaning which depends on OPERATION:\n\
5635 For file I/O, TARGET is a file name.\n\
5636 For process I/O, TARGET is a process name.\n\
5637 For network I/O, TARGET is a service name or a port number\n\
5638 \n\
5639 This function looks up what specified for TARGET in,\n\
5640 `file-coding-system-alist', `process-coding-system-alist',\n\
5641 or `network-coding-system-alist' depending on OPERATION.\n\
5642 They may specify a coding system, a cons of coding systems,\n\
5643 or a function symbol to call.\n\
5644 In the last case, we call the function with one argument,\n\
5645 which is a list of all the arguments given to this function.")
5646 (nargs, args)
5647 int nargs;
5648 Lisp_Object *args;
5649 {
5650 Lisp_Object operation, target_idx, target, val;
5651 register Lisp_Object chain;
5652
5653 if (nargs < 2)
5654 error ("Too few arguments");
5655 operation = args[0];
5656 if (!SYMBOLP (operation)
5657 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5658 error ("Invalid first arguement");
5659 if (nargs < 1 + XINT (target_idx))
5660 error ("Too few arguments for operation: %s",
5661 XSYMBOL (operation)->name->data);
5662 target = args[XINT (target_idx) + 1];
5663 if (!(STRINGP (target)
5664 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5665 error ("Invalid %dth argument", XINT (target_idx) + 1);
5666
5667 chain = ((EQ (operation, Qinsert_file_contents)
5668 || EQ (operation, Qwrite_region))
5669 ? Vfile_coding_system_alist
5670 : (EQ (operation, Qopen_network_stream)
5671 ? Vnetwork_coding_system_alist
5672 : Vprocess_coding_system_alist));
5673 if (NILP (chain))
5674 return Qnil;
5675
5676 for (; CONSP (chain); chain = XCDR (chain))
5677 {
5678 Lisp_Object elt;
5679 elt = XCAR (chain);
5680
5681 if (CONSP (elt)
5682 && ((STRINGP (target)
5683 && STRINGP (XCAR (elt))
5684 && fast_string_match (XCAR (elt), target) >= 0)
5685 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5686 {
5687 val = XCDR (elt);
5688 /* Here, if VAL is both a valid coding system and a valid
5689 function symbol, we return VAL as a coding system. */
5690 if (CONSP (val))
5691 return val;
5692 if (! SYMBOLP (val))
5693 return Qnil;
5694 if (! NILP (Fcoding_system_p (val)))
5695 return Fcons (val, val);
5696 if (! NILP (Ffboundp (val)))
5697 {
5698 val = call1 (val, Flist (nargs, args));
5699 if (CONSP (val))
5700 return val;
5701 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5702 return Fcons (val, val);
5703 }
5704 return Qnil;
5705 }
5706 }
5707 return Qnil;
5708 }
5709
5710 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5711 Supdate_coding_systems_internal, 0, 0, 0,
5712 "Update internal database for ISO2022 and CCL based coding systems.\n\
5713 When values of the following coding categories are changed, you must\n\
5714 call this function:\n\
5715 coding-category-iso-7, coding-category-iso-7-tight,\n\
5716 coding-category-iso-8-1, coding-category-iso-8-2,\n\
5717 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5718 coding-category-ccl")
5719 ()
5720 {
5721 int i;
5722
5723 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
5724 {
5725 Lisp_Object val;
5726
5727 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5728 if (!NILP (val))
5729 {
5730 if (! coding_system_table[i])
5731 coding_system_table[i] = ((struct coding_system *)
5732 xmalloc (sizeof (struct coding_system)));
5733 setup_coding_system (val, coding_system_table[i]);
5734 }
5735 else if (coding_system_table[i])
5736 {
5737 xfree (coding_system_table[i]);
5738 coding_system_table[i] = NULL;
5739 }
5740 }
5741
5742 return Qnil;
5743 }
5744
5745 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5746 Sset_coding_priority_internal, 0, 0, 0,
5747 "Update internal database for the current value of `coding-category-list'.\n\
5748 This function is internal use only.")
5749 ()
5750 {
5751 int i = 0, idx;
5752 Lisp_Object val;
5753
5754 val = Vcoding_category_list;
5755
5756 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5757 {
5758 if (! SYMBOLP (XCAR (val)))
5759 break;
5760 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5761 if (idx >= CODING_CATEGORY_IDX_MAX)
5762 break;
5763 coding_priorities[i++] = (1 << idx);
5764 val = XCDR (val);
5765 }
5766 /* If coding-category-list is valid and contains all coding
5767 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5768 the following code saves Emacs from craching. */
5769 while (i < CODING_CATEGORY_IDX_MAX)
5770 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5771
5772 return Qnil;
5773 }
5774
5775 #endif /* emacs */
5776
5777 \f
5778 /*** 9. Post-amble ***/
5779
5780 void
5781 init_coding ()
5782 {
5783 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5784 }
5785
5786 void
5787 init_coding_once ()
5788 {
5789 int i;
5790
5791 /* Emacs' internal format specific initialize routine. */
5792 for (i = 0; i <= 0x20; i++)
5793 emacs_code_class[i] = EMACS_control_code;
5794 emacs_code_class[0x0A] = EMACS_linefeed_code;
5795 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5796 for (i = 0x21 ; i < 0x7F; i++)
5797 emacs_code_class[i] = EMACS_ascii_code;
5798 emacs_code_class[0x7F] = EMACS_control_code;
5799 for (i = 0x80; i < 0xFF; i++)
5800 emacs_code_class[i] = EMACS_invalid_code;
5801 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5802 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5803 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5804 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5805
5806 /* ISO2022 specific initialize routine. */
5807 for (i = 0; i < 0x20; i++)
5808 iso_code_class[i] = ISO_control_code;
5809 for (i = 0x21; i < 0x7F; i++)
5810 iso_code_class[i] = ISO_graphic_plane_0;
5811 for (i = 0x80; i < 0xA0; i++)
5812 iso_code_class[i] = ISO_control_code;
5813 for (i = 0xA1; i < 0xFF; i++)
5814 iso_code_class[i] = ISO_graphic_plane_1;
5815 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5816 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5817 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5818 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5819 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5820 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5821 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5822 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5823 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5824 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5825
5826 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5827
5828 setup_coding_system (Qnil, &keyboard_coding);
5829 setup_coding_system (Qnil, &terminal_coding);
5830 setup_coding_system (Qnil, &safe_terminal_coding);
5831 setup_coding_system (Qnil, &default_buffer_file_coding);
5832
5833 bzero (coding_system_table, sizeof coding_system_table);
5834
5835 bzero (ascii_skip_code, sizeof ascii_skip_code);
5836 for (i = 0; i < 128; i++)
5837 ascii_skip_code[i] = 1;
5838
5839 #if defined (MSDOS) || defined (WINDOWSNT)
5840 system_eol_type = CODING_EOL_CRLF;
5841 #else
5842 system_eol_type = CODING_EOL_LF;
5843 #endif
5844
5845 inhibit_pre_post_conversion = 0;
5846 }
5847
5848 #ifdef emacs
5849
5850 void
5851 syms_of_coding ()
5852 {
5853 Qtarget_idx = intern ("target-idx");
5854 staticpro (&Qtarget_idx);
5855
5856 Qcoding_system_history = intern ("coding-system-history");
5857 staticpro (&Qcoding_system_history);
5858 Fset (Qcoding_system_history, Qnil);
5859
5860 /* Target FILENAME is the first argument. */
5861 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5862 /* Target FILENAME is the third argument. */
5863 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5864
5865 Qcall_process = intern ("call-process");
5866 staticpro (&Qcall_process);
5867 /* Target PROGRAM is the first argument. */
5868 Fput (Qcall_process, Qtarget_idx, make_number (0));
5869
5870 Qcall_process_region = intern ("call-process-region");
5871 staticpro (&Qcall_process_region);
5872 /* Target PROGRAM is the third argument. */
5873 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5874
5875 Qstart_process = intern ("start-process");
5876 staticpro (&Qstart_process);
5877 /* Target PROGRAM is the third argument. */
5878 Fput (Qstart_process, Qtarget_idx, make_number (2));
5879
5880 Qopen_network_stream = intern ("open-network-stream");
5881 staticpro (&Qopen_network_stream);
5882 /* Target SERVICE is the fourth argument. */
5883 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5884
5885 Qcoding_system = intern ("coding-system");
5886 staticpro (&Qcoding_system);
5887
5888 Qeol_type = intern ("eol-type");
5889 staticpro (&Qeol_type);
5890
5891 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5892 staticpro (&Qbuffer_file_coding_system);
5893
5894 Qpost_read_conversion = intern ("post-read-conversion");
5895 staticpro (&Qpost_read_conversion);
5896
5897 Qpre_write_conversion = intern ("pre-write-conversion");
5898 staticpro (&Qpre_write_conversion);
5899
5900 Qno_conversion = intern ("no-conversion");
5901 staticpro (&Qno_conversion);
5902
5903 Qundecided = intern ("undecided");
5904 staticpro (&Qundecided);
5905
5906 Qcoding_system_p = intern ("coding-system-p");
5907 staticpro (&Qcoding_system_p);
5908
5909 Qcoding_system_error = intern ("coding-system-error");
5910 staticpro (&Qcoding_system_error);
5911
5912 Fput (Qcoding_system_error, Qerror_conditions,
5913 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5914 Fput (Qcoding_system_error, Qerror_message,
5915 build_string ("Invalid coding system"));
5916
5917 Qcoding_category = intern ("coding-category");
5918 staticpro (&Qcoding_category);
5919 Qcoding_category_index = intern ("coding-category-index");
5920 staticpro (&Qcoding_category_index);
5921
5922 Vcoding_category_table
5923 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5924 staticpro (&Vcoding_category_table);
5925 {
5926 int i;
5927 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5928 {
5929 XVECTOR (Vcoding_category_table)->contents[i]
5930 = intern (coding_category_name[i]);
5931 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5932 Qcoding_category_index, make_number (i));
5933 }
5934 }
5935
5936 Qtranslation_table = intern ("translation-table");
5937 staticpro (&Qtranslation_table);
5938 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
5939
5940 Qtranslation_table_id = intern ("translation-table-id");
5941 staticpro (&Qtranslation_table_id);
5942
5943 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5944 staticpro (&Qtranslation_table_for_decode);
5945
5946 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5947 staticpro (&Qtranslation_table_for_encode);
5948
5949 Qsafe_charsets = intern ("safe-charsets");
5950 staticpro (&Qsafe_charsets);
5951
5952 Qvalid_codes = intern ("valid-codes");
5953 staticpro (&Qvalid_codes);
5954
5955 Qemacs_mule = intern ("emacs-mule");
5956 staticpro (&Qemacs_mule);
5957
5958 Qraw_text = intern ("raw-text");
5959 staticpro (&Qraw_text);
5960
5961 defsubr (&Scoding_system_p);
5962 defsubr (&Sread_coding_system);
5963 defsubr (&Sread_non_nil_coding_system);
5964 defsubr (&Scheck_coding_system);
5965 defsubr (&Sdetect_coding_region);
5966 defsubr (&Sdetect_coding_string);
5967 defsubr (&Sdecode_coding_region);
5968 defsubr (&Sencode_coding_region);
5969 defsubr (&Sdecode_coding_string);
5970 defsubr (&Sencode_coding_string);
5971 defsubr (&Sdecode_sjis_char);
5972 defsubr (&Sencode_sjis_char);
5973 defsubr (&Sdecode_big5_char);
5974 defsubr (&Sencode_big5_char);
5975 defsubr (&Sset_terminal_coding_system_internal);
5976 defsubr (&Sset_safe_terminal_coding_system_internal);
5977 defsubr (&Sterminal_coding_system);
5978 defsubr (&Sset_keyboard_coding_system_internal);
5979 defsubr (&Skeyboard_coding_system);
5980 defsubr (&Sfind_operation_coding_system);
5981 defsubr (&Supdate_coding_systems_internal);
5982 defsubr (&Sset_coding_priority_internal);
5983
5984 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5985 "List of coding systems.\n\
5986 \n\
5987 Do not alter the value of this variable manually. This variable should be\n\
5988 updated by the functions `make-coding-system' and\n\
5989 `define-coding-system-alias'.");
5990 Vcoding_system_list = Qnil;
5991
5992 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5993 "Alist of coding system names.\n\
5994 Each element is one element list of coding system name.\n\
5995 This variable is given to `completing-read' as TABLE argument.\n\
5996 \n\
5997 Do not alter the value of this variable manually. This variable should be\n\
5998 updated by the functions `make-coding-system' and\n\
5999 `define-coding-system-alias'.");
6000 Vcoding_system_alist = Qnil;
6001
6002 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6003 "List of coding-categories (symbols) ordered by priority.");
6004 {
6005 int i;
6006
6007 Vcoding_category_list = Qnil;
6008 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6009 Vcoding_category_list
6010 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6011 Vcoding_category_list);
6012 }
6013
6014 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6015 "Specify the coding system for read operations.\n\
6016 It is useful to bind this variable with `let', but do not set it globally.\n\
6017 If the value is a coding system, it is used for decoding on read operation.\n\
6018 If not, an appropriate element is used from one of the coding system alists:\n\
6019 There are three such tables, `file-coding-system-alist',\n\
6020 `process-coding-system-alist', and `network-coding-system-alist'.");
6021 Vcoding_system_for_read = Qnil;
6022
6023 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6024 "Specify the coding system for write operations.\n\
6025 Programs bind this variable with `let', but you should not set it globally.\n\
6026 If the value is a coding system, it is used for encoding of output,\n\
6027 when writing it to a file and when sending it to a file or subprocess.\n\
6028 \n\
6029 If this does not specify a coding system, an appropriate element\n\
6030 is used from one of the coding system alists:\n\
6031 There are three such tables, `file-coding-system-alist',\n\
6032 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6033 For output to files, if the above procedure does not specify a coding system,\n\
6034 the value of `buffer-file-coding-system' is used.");
6035 Vcoding_system_for_write = Qnil;
6036
6037 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6038 "Coding system used in the latest file or process I/O.");
6039 Vlast_coding_system_used = Qnil;
6040
6041 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6042 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6043 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6044 such conversion.");
6045 inhibit_eol_conversion = 0;
6046
6047 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6048 "Non-nil means process buffer inherits coding system of process output.\n\
6049 Bind it to t if the process output is to be treated as if it were a file\n\
6050 read from some filesystem.");
6051 inherit_process_coding_system = 0;
6052
6053 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6054 "Alist to decide a coding system to use for a file I/O operation.\n\
6055 The format is ((PATTERN . VAL) ...),\n\
6056 where PATTERN is a regular expression matching a file name,\n\
6057 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6058 If VAL is a coding system, it is used for both decoding and encoding\n\
6059 the file contents.\n\
6060 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6061 and the cdr part is used for encoding.\n\
6062 If VAL is a function symbol, the function must return a coding system\n\
6063 or a cons of coding systems which are used as above.\n\
6064 \n\
6065 See also the function `find-operation-coding-system'\n\
6066 and the variable `auto-coding-alist'.");
6067 Vfile_coding_system_alist = Qnil;
6068
6069 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6070 "Alist to decide a coding system to use for a process I/O operation.\n\
6071 The format is ((PATTERN . VAL) ...),\n\
6072 where PATTERN is a regular expression matching a program name,\n\
6073 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6074 If VAL is a coding system, it is used for both decoding what received\n\
6075 from the program and encoding what sent to the program.\n\
6076 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6077 and the cdr part is used for encoding.\n\
6078 If VAL is a function symbol, the function must return a coding system\n\
6079 or a cons of coding systems which are used as above.\n\
6080 \n\
6081 See also the function `find-operation-coding-system'.");
6082 Vprocess_coding_system_alist = Qnil;
6083
6084 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6085 "Alist to decide a coding system to use for a network I/O operation.\n\
6086 The format is ((PATTERN . VAL) ...),\n\
6087 where PATTERN is a regular expression matching a network service name\n\
6088 or is a port number to connect to,\n\
6089 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6090 If VAL is a coding system, it is used for both decoding what received\n\
6091 from the network stream and encoding what sent to the network stream.\n\
6092 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6093 and the cdr part is used for encoding.\n\
6094 If VAL is a function symbol, the function must return a coding system\n\
6095 or a cons of coding systems which are used as above.\n\
6096 \n\
6097 See also the function `find-operation-coding-system'.");
6098 Vnetwork_coding_system_alist = Qnil;
6099
6100 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6101 "Coding system to use with system messages.");
6102 Vlocale_coding_system = Qnil;
6103
6104 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6105 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6106 eol_mnemonic_unix = build_string (":");
6107
6108 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6109 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6110 eol_mnemonic_dos = build_string ("\\");
6111
6112 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6113 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6114 eol_mnemonic_mac = build_string ("/");
6115
6116 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6117 "*String displayed in mode line when end-of-line format is not yet determined.");
6118 eol_mnemonic_undecided = build_string (":");
6119
6120 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6121 "*Non-nil enables character translation while encoding and decoding.");
6122 Venable_character_translation = Qt;
6123
6124 DEFVAR_LISP ("standard-translation-table-for-decode",
6125 &Vstandard_translation_table_for_decode,
6126 "Table for translating characters while decoding.");
6127 Vstandard_translation_table_for_decode = Qnil;
6128
6129 DEFVAR_LISP ("standard-translation-table-for-encode",
6130 &Vstandard_translation_table_for_encode,
6131 "Table for translationg characters while encoding.");
6132 Vstandard_translation_table_for_encode = Qnil;
6133
6134 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6135 "Alist of charsets vs revision numbers.\n\
6136 While encoding, if a charset (car part of an element) is found,\n\
6137 designate it with the escape sequence identifing revision (cdr part of the element).");
6138 Vcharset_revision_alist = Qnil;
6139
6140 DEFVAR_LISP ("default-process-coding-system",
6141 &Vdefault_process_coding_system,
6142 "Cons of coding systems used for process I/O by default.\n\
6143 The car part is used for decoding a process output,\n\
6144 the cdr part is used for encoding a text to be sent to a process.");
6145 Vdefault_process_coding_system = Qnil;
6146
6147 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6148 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6149 This is a vector of length 256.\n\
6150 If Nth element is non-nil, the existence of code N in a file\n\
6151 \(or output of subprocess) doesn't prevent it to be detected as\n\
6152 a coding system of ISO 2022 variant which has a flag\n\
6153 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6154 or reading output of a subprocess.\n\
6155 Only 128th through 159th elements has a meaning.");
6156 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6157
6158 DEFVAR_LISP ("select-safe-coding-system-function",
6159 &Vselect_safe_coding_system_function,
6160 "Function to call to select safe coding system for encoding a text.\n\
6161 \n\
6162 If set, this function is called to force a user to select a proper\n\
6163 coding system which can encode the text in the case that a default\n\
6164 coding system used in each operation can't encode the text.\n\
6165 \n\
6166 The default value is `select-safe-coding-system' (which see).");
6167 Vselect_safe_coding_system_function = Qnil;
6168
6169 }
6170
6171 char *
6172 emacs_strerror (error_number)
6173 int error_number;
6174 {
6175 char *str;
6176
6177 synchronize_system_messages_locale ();
6178 str = strerror (error_number);
6179
6180 if (! NILP (Vlocale_coding_system))
6181 {
6182 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6183 Vlocale_coding_system,
6184 0);
6185 str = (char *) XSTRING (dec)->data;
6186 }
6187
6188 return str;
6189 }
6190
6191 #endif /* emacs */