]> code.delx.au - gnu-emacs/blob - src/coding.c
Remove eval-when-compile.
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 0. General comments
25 1. Preamble
26 2. Emacs' internal format (emacs-mule) handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. CCL handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
33 9. Post-amble
34
35 */
36
37 /*** 0. General comments ***/
38
39
40 /*** GENERAL NOTE on CODING SYSTEM ***
41
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
47 coding system.
48
49 0. Emacs' internal format (emacs-mule)
50
51 Emacs itself holds a multi-lingual character in a buffer and a string
52 in a special format. Details are described in section 2.
53
54 1. ISO2022
55
56 The most famous coding system for multiple character sets. X's
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
60
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
62
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
65 section 4.
66
67 3. BIG5
68
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
74
75 4. Raw text
76
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
79
80 5. Other
81
82 If a user wants to read/write a text encoded in a coding system not
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
86
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
89 information about it is set in a structure of type `struct
90 coding_system' for rapid processing. See section 6 for more details.
91
92 */
93
94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
95
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
98 whereas DOS's format is two-byte sequence of `carriage-return' and
99 `line-feed' codes. MacOS's format is usually one byte of
100 `carriage-return'.
101
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
104 any format of end-of-line. So, Emacs has information of format of
105 end-of-line in each coding-system. See section 6 for more details.
106
107 */
108
109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
110
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX are set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
116 #if 0
117 int
118 detect_coding_emacs_mule (src, src_end)
119 unsigned char *src, *src_end;
120 {
121 ...
122 }
123 #endif
124
125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
126
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
131
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
136
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches at the head of not-yet-decoded source text.
140
141 Below is a template of these functions. */
142 #if 0
143 static void
144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 {
149 ...
150 }
151 #endif
152
153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
154
155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
159
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
164
165 DST_BYTES zero means that source area and destination area are
166 overlapped, which means that we can produce an encoded text until it
167 reaches at the head of not-yet-encoded source text.
168
169 Below is a template of these functions. */
170 #if 0
171 static void
172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
173 struct coding_system *coding;
174 unsigned char *source, *destination;
175 int src_bytes, dst_bytes;
176 {
177 ...
178 }
179 #endif
180
181 /*** COMMONLY USED MACROS ***/
182
183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
190
/* Fetch the next source byte into C1 and advance `src'.  When no
   byte is left before `src_end', record
   CODING_FINISH_INSUFFICIENT_SRC in coding->result and jump to
   `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
200
/* Fetch the next two source bytes into C1 and C2 and advance `src'
   past them.  When fewer than two bytes remain before `src_end',
   nothing is consumed: CODING_FINISH_INSUFFICIENT_SRC is recorded in
   coding->result and control jumps to `label_end_of_loop'.

   The remaining length is computed as a pointer difference rather
   than by forming `src + 1', because that sum would point beyond
   one-past-the-end of the source when `src' already equals
   `src_end'.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src_end - src < 2)                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    c2 = *src++;                                                \
  } while (0)
211
212
213 /* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding'
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
222
/* Read one character from `src' into C and advance `src' past it.
   With no bytes left, CODING_FINISH_INSUFFICIENT_SRC is stored in
   coding->result and control jumps to `label_end_of_loop'.  The
   character is passed through `translation_table' unless that table
   is nil.  */
#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len < 1)                                                \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))      \
      {                                                         \
        /* A lone byte stands for itself.  */                   \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    if (! NILP (translation_table))                             \
      c = translate_char (translation_table, c, 0, 0, 0);       \
    src += bytes;                                               \
  } while (0)
241
242
243 /* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst'.
245
246 If we are now in the middle of composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
249 `dst'.
250
251 This macro is used in decoding routines. */
252
#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    /* The character itself is written to `dst' when no composition    \
       is in progress, or when the composition is relative or          \
       rule-based.  */                                                  \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst + bytes) <= (dst_bytes ? dst_end : src))               \
          {                                                             \
            dst += CHAR_STRING (c, dst);                                \
            coding->produced_char++;                                    \
          }                                                             \
        else                                                            \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
      }                                                                 \
                                                                        \
    /* For every composition method other than the relative one, C is  \
       also recorded as a composition component.  */                    \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = (coding->composing != COMPOSITION_WITH_ALTCHARS);           \
      }                                                                 \
  } while (0)
277
278
/* Store byte C at `dst' and advance `dst'.  The writable area ends at
   `dst_end', or at `src' when `dst_bytes' is zero.  When no room is
   left, CODING_FINISH_INSUFFICIENT_DST is stored in coding->result
   and control jumps to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
288
/* Store bytes C1 and C2 at `dst' and advance `dst' by two.  The
   writable area ends at `dst_end', or at `src' when `dst_bytes' is
   zero.  When fewer than two bytes of room remain, nothing is
   written: CODING_FINISH_INSUFFICIENT_DST is stored in coding->result
   and control jumps to `label_end_of_loop'.

   The room check subtracts pointers instead of forming `dst + 2',
   because that sum may point beyond one-past-the-end of the
   destination area.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if ((dst_bytes ? dst_end : src) - dst < 2)                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = c1, *dst++ = c2;                                   \
  } while (0)
298
/* Copy the bytes in [FROM, TO) to `dst', advancing both FROM and
   `dst'.  FROM must be a modifiable pointer; FROM and TO are
   evaluated more than once.  The writable area ends at `dst_end', or
   at `src' when `dst_bytes' is zero.  When the whole range does not
   fit, nothing is copied: CODING_FINISH_INSUFFICIENT_DST is stored in
   coding->result and control jumps to `label_end_of_loop'.

   The arguments are parenthesized so that an expression such as a
   conditional can be passed as TO, and the room check compares
   pointer differences so that no pointer beyond the destination area
   is ever formed.  */
#define EMIT_BYTES(from, to)                                            \
  do {                                                                  \
    if ((dst_bytes ? dst_end : src) - dst < (to) - (from))              \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    while ((from) < (to))                                               \
      *dst++ = *(from)++;                                               \
  } while (0)
309
310 \f
311 /*** 1. Preamble ***/
312
313 #ifdef emacs
314 #include <config.h>
315 #endif
316
317 #include <stdio.h>
318
319 #ifdef emacs
320
321 #include "lisp.h"
322 #include "buffer.h"
323 #include "charset.h"
324 #include "composite.h"
325 #include "ccl.h"
326 #include "coding.h"
327 #include "window.h"
328
329 #else /* not emacs */
330
331 #include "mulelib.h"
332
333 #endif /* not emacs */
334
335 Lisp_Object Qcoding_system, Qeol_type;
336 Lisp_Object Qbuffer_file_coding_system;
337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
338 Lisp_Object Qno_conversion, Qundecided;
339 Lisp_Object Qcoding_system_history;
340 Lisp_Object Qsafe_charsets;
341 Lisp_Object Qvalid_codes;
342
343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
345 Lisp_Object Qstart_process, Qopen_network_stream;
346 Lisp_Object Qtarget_idx;
347
348 Lisp_Object Vselect_safe_coding_system_function;
349
350 /* Mnemonic string for each format of end-of-line. */
351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
352 /* Mnemonic string to indicate format of end-of-line is not yet
353 decided. */
354 Lisp_Object eol_mnemonic_undecided;
355
356 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
358 int system_eol_type;
359
360 #ifdef emacs
361
362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
363
364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
365
366 /* Coding system emacs-mule and raw-text are for converting only
367 end-of-line format. */
368 Lisp_Object Qemacs_mule, Qraw_text;
369
370 /* Coding-systems are handed between Emacs Lisp programs and C internal
371 routines by the following three variables. */
372 /* Coding-system for reading files and receiving data from process. */
373 Lisp_Object Vcoding_system_for_read;
374 /* Coding-system for writing files and sending data to process. */
375 Lisp_Object Vcoding_system_for_write;
376 /* Coding-system actually used in the latest I/O. */
377 Lisp_Object Vlast_coding_system_used;
378
379 /* A vector of length 256 which contains information about special
380 Latin codes (especially for dealing with Microsoft codes). */
381 Lisp_Object Vlatin_extra_code_table;
382
383 /* Flag to inhibit code conversion of end-of-line format. */
384 int inhibit_eol_conversion;
385
386 /* Flag to make buffer-file-coding-system inherit from process-coding. */
387 int inherit_process_coding_system;
388
389 /* Coding system to be used to encode text for terminal display. */
390 struct coding_system terminal_coding;
391
392 /* Coding system to be used to encode text for terminal display when
393 terminal coding system is nil. */
394 struct coding_system safe_terminal_coding;
395
396 /* Coding system of what is sent from terminal keyboard. */
397 struct coding_system keyboard_coding;
398
399 /* Default coding system to be used to write a file. */
400 struct coding_system default_buffer_file_coding;
401
402 Lisp_Object Vfile_coding_system_alist;
403 Lisp_Object Vprocess_coding_system_alist;
404 Lisp_Object Vnetwork_coding_system_alist;
405
406 Lisp_Object Vlocale_coding_system;
407
408 #endif /* emacs */
409
410 Lisp_Object Qcoding_category, Qcoding_category_index;
411
412 /* List of symbols `coding-category-xxx' ordered by priority. */
413 Lisp_Object Vcoding_category_list;
414
415 /* Table of coding categories (Lisp symbols). */
416 Lisp_Object Vcoding_category_table;
417
418 /* Table of names of symbol for each coding-category. */
419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
420 "coding-category-emacs-mule",
421 "coding-category-sjis",
422 "coding-category-iso-7",
423 "coding-category-iso-7-tight",
424 "coding-category-iso-8-1",
425 "coding-category-iso-8-2",
426 "coding-category-iso-7-else",
427 "coding-category-iso-8-else",
428 "coding-category-ccl",
429 "coding-category-big5",
430 "coding-category-utf-8",
431 "coding-category-utf-16-be",
432 "coding-category-utf-16-le",
433 "coding-category-raw-text",
434 "coding-category-binary"
435 };
436
437 /* Table of pointers to coding systems corresponding to each coding
438 categories. */
439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
440
441 /* Table of coding category masks. Nth element is a mask for a coding
442 category of which priority is Nth. */
443 static
444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
445
446 /* Flag to tell if we look up translation table on character code
447 conversion. */
448 Lisp_Object Venable_character_translation;
449 /* Standard translation table to look up on decoding (reading). */
450 Lisp_Object Vstandard_translation_table_for_decode;
451 /* Standard translation table to look up on encoding (writing). */
452 Lisp_Object Vstandard_translation_table_for_encode;
453
454 Lisp_Object Qtranslation_table;
455 Lisp_Object Qtranslation_table_id;
456 Lisp_Object Qtranslation_table_for_decode;
457 Lisp_Object Qtranslation_table_for_encode;
458
459 /* Alist of charsets vs revision number. */
460 Lisp_Object Vcharset_revision_alist;
461
462 /* Default coding systems used for process I/O. */
463 Lisp_Object Vdefault_process_coding_system;
464
465 /* Global flag to tell that we can't call post-read-conversion and
466 pre-write-conversion functions. Usually the value is zero, but it
467 is set to 1 temporarily while such functions are running. This is
468 to avoid infinite recursive call. */
469 static int inhibit_pre_post_conversion;
470
471 \f
472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
473
474 /* Emacs' internal format for encoding multiple character sets is a
475 kind of multi-byte encoding, i.e. characters are encoded by
476 variable-length sequences of one-byte codes.
477
478 ASCII characters and control characters (e.g. `tab', `newline') are
479 represented by one-byte sequences which are their ASCII codes, in
480 the range 0x00 through 0x7F.
481
482 8-bit characters of the range 0x80..0x9F are represented by
483 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
484 code + 0x20).
485
486 8-bit characters of the range 0xA0..0xFF are represented by
487 one-byte sequences which are their 8-bit code.
488
489 The other characters are represented by a sequence of `base
490 leading-code', optional `extended leading-code', and one or two
491 `position-code's. The length of the sequence is determined by the
492 base leading-code. Leading-code takes the range 0x80 through 0x9F,
493 whereas extended leading-code and position-code take the range 0xA0
494 through 0xFF. See `charset.h' for more details about leading-code
495 and position-code.
496
497 --- CODE RANGE of Emacs' internal format ---
498 character set range
499 ------------- -----
500 ascii 0x00..0x7F
501 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
502 eight-bit-graphic 0xA0..0xFF
503 ELSE 0x81..0x9F + [0xA0..0xFF]+
504 ---------------------------------------------
505
506 */
507
508 enum emacs_code_class_type emacs_code_class[256];
509
510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
511 Check if a text is encoded in Emacs' internal format. If it is,
512 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
513
514 int
515 detect_coding_emacs_mule (src, src_end)
516 unsigned char *src, *src_end;
517 {
518 unsigned char c;
519 int composing = 0;
520 /* Dummy for ONE_MORE_BYTE. */
521 struct coding_system dummy_coding;
522 struct coding_system *coding = &dummy_coding;
523
524 while (1)
525 {
526 ONE_MORE_BYTE (c);
527
528 if (composing)
529 {
530 if (c < 0xA0)
531 composing = 0;
532 else if (c == 0xA0)
533 {
534 ONE_MORE_BYTE (c);
535 c &= 0x7F;
536 }
537 else
538 c -= 0x20;
539 }
540
541 if (c < 0x20)
542 {
543 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
544 return 0;
545 }
546 else if (c >= 0x80 && c < 0xA0)
547 {
548 if (c == 0x80)
549 /* Old leading code for a composite character. */
550 composing = 1;
551 else
552 {
553 unsigned char *src_base = src - 1;
554 int bytes;
555
556 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
557 bytes))
558 return 0;
559 src = src_base + bytes;
560 }
561 }
562 }
563 label_end_of_loop:
564 return CODING_CATEGORY_MASK_EMACS_MULE;
565 }
566
567
568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
569
570 static void
571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
572 struct coding_system *coding;
573 unsigned char *source, *destination;
574 int src_bytes, dst_bytes;
575 {
576 unsigned char *src = source;
577 unsigned char *src_end = source + src_bytes;
578 unsigned char *dst = destination;
579 unsigned char *dst_end = destination + dst_bytes;
580 /* SRC_BASE remembers the start position in source in each loop.
581 The loop will be exited when there's not enough source code, or
582 when there's not enough destination area to produce a
583 character. */
584 unsigned char *src_base;
585
586 coding->produced_char = 0;
587 while (src < src_end)
588 {
589 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
590 int bytes;
591
592 src_base = src;
593 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
594 {
595 p = src;
596 src += bytes;
597 }
598 else
599 {
600 bytes = CHAR_STRING (*src, tmp);
601 p = tmp;
602 src++;
603 }
604 if (dst + bytes >= (dst_bytes ? dst_end : src))
605 {
606 coding->result = CODING_FINISH_INSUFFICIENT_DST;
607 break;
608 }
609 while (bytes--) *dst++ = *p++;
610 coding->produced_char++;
611 }
612 coding->consumed = coding->consumed_char = src_base - source;
613 coding->produced = dst - destination;
614 }
615
/* Encoding to this format is delegated unchanged to encode_eol.  */
#define encode_coding_emacs_mule(cs, from, to, n_src, n_dst) \
  encode_eol (cs, from, to, n_src, n_dst)
618
619
620 \f
621 /*** 3. ISO2022 handlers ***/
622
623 /* The following note describes the coding system ISO2022 briefly.
624 Since the intention of this note is to help understand the
625 functions in this file, some parts are NOT ACCURATE or OVERLY
626 SIMPLIFIED. For thorough understanding, please refer to the
627 original document of ISO2022.
628
629 ISO2022 provides many mechanisms to encode several character sets
630 in 7-bit and 8-bit environments. For 7-bit environments, all text
631 is encoded using bytes less than 128. This may make the encoded
632 text a little bit longer, but the text passes more easily through
633 several gateways, some of which strip off MSB (Most Significant Bit).
634
635 There are two kinds of character sets: control character set and
636 graphic character set. The former contains control characters such
637 as `newline' and `escape' to provide control functions (control
638 functions are also provided by escape sequences). The latter
639 contains graphic characters such as 'A' and '-'. Emacs recognizes
640 two control character sets and many graphic character sets.
641
642 Graphic character sets are classified into one of the following
643 four classes, according to the number of bytes (DIMENSION) and
644 number of characters in one dimension (CHARS) of the set:
645 - DIMENSION1_CHARS94
646 - DIMENSION1_CHARS96
647 - DIMENSION2_CHARS94
648 - DIMENSION2_CHARS96
649
650 In addition, each character set is assigned an identification tag,
651 unique for each set, called "final character" (denoted as <F>
652 hereafter). The <F> of each character set is decided by ECMA(*)
653 when it is registered in ISO. The code range of <F> is 0x30..0x7F
654 (0x30..0x3F are for private use only).
655
656 Note (*): ECMA = European Computer Manufacturers Association
657
658 Here are examples of graphic character set [NAME(<F>)]:
659 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
660 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
661 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
662 o DIMENSION2_CHARS96 -- none for the moment
663
664 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
665 C0 [0x00..0x1F] -- control character plane 0
666 GL [0x20..0x7F] -- graphic character plane 0
667 C1 [0x80..0x9F] -- control character plane 1
668 GR [0xA0..0xFF] -- graphic character plane 1
669
670 A control character set is directly designated and invoked to C0 or
671 C1 by an escape sequence. The most common case is that:
672 - ISO646's control character set is designated/invoked to C0, and
673 - ISO6429's control character set is designated/invoked to C1,
674 and usually these designations/invocations are omitted in encoded
675 text. In a 7-bit environment, only C0 can be used, and a control
676 character for C1 is encoded by an appropriate escape sequence to
677 fit into the environment. All control characters for C1 are
678 defined to have corresponding escape sequences.
679
680 A graphic character set is at first designated to one of four
681 graphic registers (G0 through G3), then these graphic registers are
682 invoked to GL or GR. These designations and invocations can be
683 done independently. The most common case is that G0 is invoked to
684 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
685 these invocations and designations are omitted in encoded text.
686 In a 7-bit environment, only GL can be used.
687
688 When a graphic character set of CHARS94 is invoked to GL, codes
689 0x20 and 0x7F of the GL area work as control characters SPACE and
690 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
691 be used.
692
693 There are two ways of invocation: locking-shift and single-shift.
694 With locking-shift, the invocation lasts until the next different
695 invocation, whereas with single-shift, the invocation affects the
696 following character only and doesn't affect the locking-shift
697 state. Invocations are done by the following control characters or
698 escape sequences:
699
700 ----------------------------------------------------------------------
701 abbrev function cntrl escape seq description
702 ----------------------------------------------------------------------
703 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
704 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
705 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
706 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
707 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
708 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
709 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
710 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
711 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
712 ----------------------------------------------------------------------
713 (*) These are not used by any known coding system.
714
715 Control characters for these functions are defined by macros
716 ISO_CODE_XXX in `coding.h'.
717
718 Designations are done by the following escape sequences:
719 ----------------------------------------------------------------------
720 escape sequence description
721 ----------------------------------------------------------------------
722 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
723 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
724 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
725 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
726 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
727 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
728 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
729 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
730 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
731 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
732 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
733 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
734 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
735 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
736 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
737 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
738 ----------------------------------------------------------------------
739
740 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
741 of dimension 1, chars 94, and final character <F>, etc...
742
743 Note (*): Although these designations are not allowed in ISO2022,
744 Emacs accepts them on decoding, and produces them on encoding
745 CHARS96 character sets in a coding system which is characterized as
746 7-bit environment, non-locking-shift, and non-single-shift.
747
748 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
749 '(' can be omitted. We refer to this as "short-form" hereafter.
750
751 Now you may notice that there are a lot of ways for encoding the
752 same multilingual text in ISO2022. Actually, there exist many
753 coding systems such as Compound Text (used in X11's inter client
754 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
755 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
756 localized platforms), and all of these are variants of ISO2022.
757
758 In addition to the above, Emacs handles two more kinds of escape
759 sequences: ISO6429's direction specification and Emacs' private
760 sequence for specifying character composition.
761
762 ISO6429's direction specification takes the following form:
763 o CSI ']' -- end of the current direction
764 o CSI '0' ']' -- end of the current direction
765 o CSI '1' ']' -- start of left-to-right text
766 o CSI '2' ']' -- start of right-to-left text
767 The control character CSI (0x9B: control sequence introducer) is
768 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
769
770 Character composition specification takes the following form:
771 o ESC '0' -- start relative composition
772 o ESC '1' -- end composition
773 o ESC '2' -- start rule-base composition (*)
774 o ESC '3' -- start relative composition with alternate chars (**)
775 o ESC '4' -- start rule-base composition with alternate chars (**)
776 Since these are not standard escape sequences of any ISO standard,
777 the use of them for these meaning is restricted to Emacs only.
778
779 (*) This form is used only in Emacs 20.5 and the older versions,
780 but the newer versions can safely decode it.
781 (**) This form is used only in Emacs 21.1 and the newer versions,
782 and the older versions can't decode it.
783
784 Here's a list of examples usages of these composition escape
785 sequences (categorized by `enum composition_method').
786
787 COMPOSITION_RELATIVE:
788 ESC 0 CHAR [ CHAR ] ESC 1
789 COMPOSITION_WITH_RULE:
790 ESC 2 CHAR [ RULE CHAR ] ESC 1
791 COMPOSITION_WITH_ALTCHARS:
792 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
793 COMPOSITION_WITH_RULE_ALTCHARS:
794 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
795
796 enum iso_code_class_type iso_code_class[256];
797
/* Nonzero if the coding system registered at index IDX of
   `coding_system_table' exists and either lists CHARSET in its
   safe_charsets or has a requested designation for it.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[idx] != 0                                \
   && (coding_system_table[idx]->safe_charsets[charset]         \
       || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
           (coding_system_table[idx], charset)                  \
           != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
804
/* Nonzero if the coding system registered at index IDX of
   `coding_system_table' exists and has an initial designation
   (a value >= 0) for graphic register 1.  The table entry is checked
   for a null pointer first, as CHARSET_OK does, so that an
   unregistered category yields 0 instead of a null dereference.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx] != 0                                        \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
807
808 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
809 Check if a text is encoded in ISO2022. If it is, returns an
810 integer in which appropriate flag bits any of:
811 CODING_CATEGORY_MASK_ISO_7
812 CODING_CATEGORY_MASK_ISO_7_TIGHT
813 CODING_CATEGORY_MASK_ISO_8_1
814 CODING_CATEGORY_MASK_ISO_8_2
815 CODING_CATEGORY_MASK_ISO_7_ELSE
816 CODING_CATEGORY_MASK_ISO_8_ELSE
817 are set. If a code which should never appear in ISO2022 is found,
818 returns 0. */
819
820 int
821 detect_coding_iso2022 (src, src_end)
822 unsigned char *src, *src_end;
823 {
824 int mask = CODING_CATEGORY_MASK_ISO;
825 int mask_found = 0;
826 int reg[4], shift_out = 0, single_shifting = 0;
827 int c, c1, i, charset;
828 /* Dummy for ONE_MORE_BYTE. */
829 struct coding_system dummy_coding;
830 struct coding_system *coding = &dummy_coding;
831
832 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
833 while (mask && src < src_end)
834 {
835 ONE_MORE_BYTE (c);
836 switch (c)
837 {
838 case ISO_CODE_ESC:
839 single_shifting = 0;
840 ONE_MORE_BYTE (c);
841 if (c >= '(' && c <= '/')
842 {
843 /* Designation sequence for a charset of dimension 1. */
844 ONE_MORE_BYTE (c1);
845 if (c1 < ' ' || c1 >= 0x80
846 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
847 /* Invalid designation sequence. Just ignore. */
848 break;
849 reg[(c - '(') % 4] = charset;
850 }
851 else if (c == '$')
852 {
853 /* Designation sequence for a charset of dimension 2. */
854 ONE_MORE_BYTE (c);
855 if (c >= '@' && c <= 'B')
856 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
857 reg[0] = charset = iso_charset_table[1][0][c];
858 else if (c >= '(' && c <= '/')
859 {
860 ONE_MORE_BYTE (c1);
861 if (c1 < ' ' || c1 >= 0x80
862 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
863 /* Invalid designation sequence. Just ignore. */
864 break;
865 reg[(c - '(') % 4] = charset;
866 }
867 else
868 /* Invalid designation sequence. Just ignore. */
869 break;
870 }
871 else if (c == 'N' || c == 'O')
872 {
873 /* ESC <Fe> for SS2 or SS3. */
874 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
875 break;
876 }
877 else if (c >= '0' && c <= '4')
878 {
879 /* ESC <Fp> for start/end composition. */
880 mask_found |= CODING_CATEGORY_MASK_ISO;
881 break;
882 }
883 else
884 /* Invalid escape sequence. Just ignore. */
885 break;
886
887 /* We found a valid designation sequence for CHARSET. */
888 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
889 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
890 mask_found |= CODING_CATEGORY_MASK_ISO_7;
891 else
892 mask &= ~CODING_CATEGORY_MASK_ISO_7;
893 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
894 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
895 else
896 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
897 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
898 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
899 else
900 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
901 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
902 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
903 else
904 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
905 break;
906
907 case ISO_CODE_SO:
908 single_shifting = 0;
909 if (shift_out == 0
910 && (reg[1] >= 0
911 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
912 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
913 {
914 /* Locking shift out. */
915 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
916 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
917 }
918 break;
919
920 case ISO_CODE_SI:
921 single_shifting = 0;
922 if (shift_out == 1)
923 {
924 /* Locking shift in. */
925 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
926 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
927 }
928 break;
929
930 case ISO_CODE_CSI:
931 single_shifting = 0;
932 case ISO_CODE_SS2:
933 case ISO_CODE_SS3:
934 {
935 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
936
937 if (c != ISO_CODE_CSI)
938 {
939 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
940 & CODING_FLAG_ISO_SINGLE_SHIFT)
941 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
942 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
943 & CODING_FLAG_ISO_SINGLE_SHIFT)
944 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
945 single_shifting = 1;
946 }
947 if (VECTORP (Vlatin_extra_code_table)
948 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
949 {
950 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
951 & CODING_FLAG_ISO_LATIN_EXTRA)
952 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
953 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
954 & CODING_FLAG_ISO_LATIN_EXTRA)
955 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
956 }
957 mask &= newmask;
958 mask_found |= newmask;
959 }
960 break;
961
962 default:
963 if (c < 0x80)
964 {
965 single_shifting = 0;
966 break;
967 }
968 else if (c < 0xA0)
969 {
970 single_shifting = 0;
971 if (VECTORP (Vlatin_extra_code_table)
972 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
973 {
974 int newmask = 0;
975
976 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
977 & CODING_FLAG_ISO_LATIN_EXTRA)
978 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
979 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
980 & CODING_FLAG_ISO_LATIN_EXTRA)
981 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
982 mask &= newmask;
983 mask_found |= newmask;
984 }
985 else
986 return 0;
987 }
988 else
989 {
990 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
991 | CODING_CATEGORY_MASK_ISO_7_ELSE);
992 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
993 /* Check the length of succeeding codes of the range
994 0xA0..0FF. If the byte length is odd, we exclude
995 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
996 when we are not single shifting. */
997 if (!single_shifting
998 && mask & CODING_CATEGORY_MASK_ISO_8_2)
999 {
1000 int i = 1;
1001 while (src < src_end)
1002 {
1003 ONE_MORE_BYTE (c);
1004 if (c < 0xA0)
1005 break;
1006 i++;
1007 }
1008
1009 if (i & 1 && src < src_end)
1010 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1011 else
1012 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1013 }
1014 }
1015 break;
1016 }
1017 }
1018 label_end_of_loop:
1019 return (mask & mask_found);
1020 }
1021
/* Return the character code for the character of charset CHARSET
   whose 1st position code is C1 and whose 2nd position code is C2.
   When the variable `translation_table' is non-nil, the code is
   obtained through translate_char instead of MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2) \
  (!NILP (translation_table) \
   ? translate_char (translation_table, -1, charset, c1, c2) \
   : MAKE_CHAR (charset, c1, c2))
1031
/* Record in CODING that the charset selected by DIMENSION, CHARS and
   FINAL_CHAR is designated to graphic register REG.  Jumps to
   label_invalid_code when the designation is not accepted.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset; \
 \
    if (final_char < '0' || final_char >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    if (charset < 0 \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != reg \
            && !coding->safe_charsets[charset])) \
      { \
        /* Unknown charset, or one that is neither requested for REG \
           nor safe: remember the register and reject.  */ \
        coding->spec.iso2022.last_invalid_designation_register = reg; \
        goto label_invalid_code; \
      } \
    if (coding->spec.iso2022.last_invalid_designation_register == 0 \
        && reg == 0 \
        && charset == CHARSET_ASCII) \
      { \
        /* We should insert this designation sequence as is so \
           that it is surely written back to a file.  */ \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        goto label_invalid_code; \
      } \
    coding->spec.iso2022.last_invalid_designation_register = -1; \
    if ((coding->mode & CODING_MODE_DIRECTION) \
        && CHARSET_REVERSE_CHARSET (charset) >= 0) \
      charset = CHARSET_REVERSE_CHARSET (charset); \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1067
1068 /* Allocate a memory block for storing information about compositions.
1069 The block is chained to the already allocated blocks. */
1070
1071 void
1072 coding_allocate_composition_data (coding, char_offset)
1073 struct coding_system *coding;
1074 int char_offset;
1075 {
1076 struct composition_data *cmp_data
1077 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1078
1079 cmp_data->char_offset = char_offset;
1080 cmp_data->used = 0;
1081 cmp_data->prev = coding->cmp_data;
1082 cmp_data->next = NULL;
1083 if (coding->cmp_data)
1084 coding->cmp_data->next = cmp_data;
1085 coding->cmp_data = cmp_data;
1086 coding->cmp_data_start = 0;
1087 }
1088
/* Open a new four-slot composition record at the end of the current
   block of CODING: slot 0 is set to -1, slot 1 to the starting
   position START (offset by the block's char_offset) and slot 3 to
   METHOD.  Slot 2 is left for CODING_ADD_COMPOSITION_END.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method) \
  do { \
    struct composition_data *cmp_data = coding->cmp_data; \
    int *data = &cmp_data->data[cmp_data->used]; \
    data[3] = (int) method; \
    data[1] = cmp_data->char_offset + start; \
    data[0] = -1; \
    coding->cmp_data_start = cmp_data->used; \
    cmp_data->used += 4; \
  } while (0)
1101
/* Close the composition record that starts at coding->cmp_data_start:
   store its length in slot 0 and the ending position END (offset by
   the block's char_offset) in slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end) \
  do { \
    struct composition_data *cmp_data = coding->cmp_data; \
    int *data = &cmp_data->data[coding->cmp_data_start]; \
    data[2] = cmp_data->char_offset + end; \
    data[0] = cmp_data->used - coding->cmp_data_start; \
  } while (0)
1111
/* Append one COMPONENT (alternate character or composition rule) to
   the current block of CODING and advance its `used' count.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \
  (*(coding->cmp_data->data + coding->cmp_data->used++) = component)
1116
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC.  */

#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (coding->composing == COMPOSITION_DISABLED) \
      { \
        /* Copy the escape sequence to DST as is.  */ \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = c1 & 0x7f; \
        coding->produced_char += 2; \
      } \
    else if (COMPOSING_P (coding)) \
      { \
        /* We are already handling a composition.  If the method is \
           one of the following two, the codes following the current \
           escape sequence are actual characters stored in a buffer.  */ \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
          { \
            coding->composing = COMPOSITION_RELATIVE; \
            coding->composition_rule_follows = 0; \
          } \
      } \
    else \
      { \
        /* This is surely the start of a composition.  We must be sure \
           that coding->cmp_data has enough space to store the \
           information about the composition.  If not, terminate the \
           current decoding loop, allocate one more memory block for \
           coding->cmp_data in the caller, then start the decoding \
           loop again.  We can't allocate memory here directly because \
           it may cause buffer/string relocation.  */ \
        if (!coding->cmp_data \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE)) \
          { \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
            goto label_end_of_loop; \
          } \
        if (c1 == '0') \
          coding->composing = COMPOSITION_RELATIVE; \
        else if (c1 == '2') \
          coding->composing = COMPOSITION_WITH_RULE; \
        else if (c1 == '3') \
          coding->composing = COMPOSITION_WITH_ALTCHARS; \
        else \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS; \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
                                      coding->composing); \
        coding->composition_rule_follows = 0; \
      } \
  } while (0)
1164
/* Handle composition end sequence ESC 1.  C1 is the byte following
   ESC.

   The sequence closes the current composition only when one is
   actually in progress.  When composition is disabled, or when the
   sequence shows up outside any composition, it is copied to DST as
   is.  Without the second check a stray ESC 1 would reach
   CODING_ADD_COMPOSITION_END with no open record, which dereferences
   coding->cmp_data (possibly null) and overwrites the record at
   coding->cmp_data_start.  */

#define DECODE_COMPOSITION_END(c1) \
  do { \
    if (coding->composing == COMPOSITION_DISABLED \
        || ! COMPOSING_P (coding)) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = c1; \
        coding->produced_char += 2; \
      } \
    else \
      { \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
        coding->composing = COMPOSITION_NO; \
      } \
  } while (0)
1181
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into C2) and store the encoded rule as one component
   in coding->cmp_data.  C1 is modified (32 is subtracted from it).
   A value of C1 outside both formats stores rule 0.  */

#define DECODE_COMPOSITION_RULE(c1) \
  do { \
    int rule = 0; \
    (c1) -= 32; \
    if (c1 >= 81) \
      { \
        if (c1 < 93)            /* new format (after ver.21) */ \
          { \
            ONE_MORE_BYTE (c2); \
            rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
          } \
      } \
    else                        /* old format (before ver.21) */ \
      { \
        int gref = (c1) / 9; \
        int nref = (c1) % 9; \
        gref = (gref == 4 ? 10 : gref); \
        nref = (nref == 4 ? 10 : nref); \
        rule = COMPOSITION_ENCODE_RULE (gref, nref); \
      } \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
    coding->composition_rule_follows = 0; \
  } while (0)
1206
1207
1208 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1209
1210 static void
1211 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1212 struct coding_system *coding;
1213 unsigned char *source, *destination;
1214 int src_bytes, dst_bytes;
1215 {
1216 unsigned char *src = source;
1217 unsigned char *src_end = source + src_bytes;
1218 unsigned char *dst = destination;
1219 unsigned char *dst_end = destination + dst_bytes;
1220 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1221 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1222 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1223 /* SRC_BASE remembers the start position in source in each loop.
1224 The loop will be exited when there's not enough source code
1225 (within macro ONE_MORE_BYTE), or when there's not enough
1226 destination area to produce a character (within macro
1227 EMIT_CHAR). */
1228 unsigned char *src_base;
1229 int c, charset;
1230 Lisp_Object translation_table;
1231
1232 if (NILP (Venable_character_translation))
1233 translation_table = Qnil;
1234 else
1235 {
1236 translation_table = coding->translation_table_for_decode;
1237 if (NILP (translation_table))
1238 translation_table = Vstandard_translation_table_for_decode;
1239 }
1240
1241 coding->result = CODING_FINISH_NORMAL;
1242
1243 while (1)
1244 {
1245 int c1, c2;
1246
1247 src_base = src;
1248 ONE_MORE_BYTE (c1);
1249
1250 /* We produce no character or one character. */
1251 switch (iso_code_class [c1])
1252 {
1253 case ISO_0x20_or_0x7F:
1254 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1255 {
1256 DECODE_COMPOSITION_RULE (c1);
1257 continue;
1258 }
1259 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1260 {
1261 /* This is SPACE or DEL. */
1262 charset = CHARSET_ASCII;
1263 break;
1264 }
1265 /* This is a graphic character, we fall down ... */
1266
1267 case ISO_graphic_plane_0:
1268 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1269 {
1270 DECODE_COMPOSITION_RULE (c1);
1271 continue;
1272 }
1273 charset = charset0;
1274 break;
1275
1276 case ISO_0xA0_or_0xFF:
1277 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1278 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1279 goto label_invalid_code;
1280 /* This is a graphic character, we fall down ... */
1281
1282 case ISO_graphic_plane_1:
1283 if (charset1 < 0)
1284 goto label_invalid_code;
1285 charset = charset1;
1286 break;
1287
1288 case ISO_control_0:
1289 if (COMPOSING_P (coding))
1290 DECODE_COMPOSITION_END ('1');
1291
1292 /* All ISO2022 control characters in this class have the
1293 same representation in Emacs internal format. */
1294 if (c1 == '\n'
1295 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1296 && (coding->eol_type == CODING_EOL_CR
1297 || coding->eol_type == CODING_EOL_CRLF))
1298 {
1299 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1300 goto label_end_of_loop;
1301 }
1302 charset = CHARSET_ASCII;
1303 break;
1304
1305 case ISO_control_1:
1306 if (COMPOSING_P (coding))
1307 DECODE_COMPOSITION_END ('1');
1308 goto label_invalid_code;
1309
1310 case ISO_carriage_return:
1311 if (COMPOSING_P (coding))
1312 DECODE_COMPOSITION_END ('1');
1313
1314 if (coding->eol_type == CODING_EOL_CR)
1315 c1 = '\n';
1316 else if (coding->eol_type == CODING_EOL_CRLF)
1317 {
1318 ONE_MORE_BYTE (c1);
1319 if (c1 != ISO_CODE_LF)
1320 {
1321 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1322 {
1323 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1324 goto label_end_of_loop;
1325 }
1326 src--;
1327 c1 = '\r';
1328 }
1329 }
1330 charset = CHARSET_ASCII;
1331 break;
1332
1333 case ISO_shift_out:
1334 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1335 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1336 goto label_invalid_code;
1337 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1338 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1339 continue;
1340
1341 case ISO_shift_in:
1342 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1343 goto label_invalid_code;
1344 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1345 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1346 continue;
1347
1348 case ISO_single_shift_2_7:
1349 case ISO_single_shift_2:
1350 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1351 goto label_invalid_code;
1352 /* SS2 is handled as an escape sequence of ESC 'N' */
1353 c1 = 'N';
1354 goto label_escape_sequence;
1355
1356 case ISO_single_shift_3:
1357 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1358 goto label_invalid_code;
1359 /* SS2 is handled as an escape sequence of ESC 'O' */
1360 c1 = 'O';
1361 goto label_escape_sequence;
1362
1363 case ISO_control_sequence_introducer:
1364 /* CSI is handled as an escape sequence of ESC '[' ... */
1365 c1 = '[';
1366 goto label_escape_sequence;
1367
1368 case ISO_escape:
1369 ONE_MORE_BYTE (c1);
1370 label_escape_sequence:
1371 /* Escape sequences handled by Emacs are invocation,
1372 designation, direction specification, and character
1373 composition specification. */
1374 switch (c1)
1375 {
1376 case '&': /* revision of following character set */
1377 ONE_MORE_BYTE (c1);
1378 if (!(c1 >= '@' && c1 <= '~'))
1379 goto label_invalid_code;
1380 ONE_MORE_BYTE (c1);
1381 if (c1 != ISO_CODE_ESC)
1382 goto label_invalid_code;
1383 ONE_MORE_BYTE (c1);
1384 goto label_escape_sequence;
1385
1386 case '$': /* designation of 2-byte character set */
1387 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1388 goto label_invalid_code;
1389 ONE_MORE_BYTE (c1);
1390 if (c1 >= '@' && c1 <= 'B')
1391 { /* designation of JISX0208.1978, GB2312.1980,
1392 or JISX0208.1980 */
1393 DECODE_DESIGNATION (0, 2, 94, c1);
1394 }
1395 else if (c1 >= 0x28 && c1 <= 0x2B)
1396 { /* designation of DIMENSION2_CHARS94 character set */
1397 ONE_MORE_BYTE (c2);
1398 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1399 }
1400 else if (c1 >= 0x2C && c1 <= 0x2F)
1401 { /* designation of DIMENSION2_CHARS96 character set */
1402 ONE_MORE_BYTE (c2);
1403 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1404 }
1405 else
1406 goto label_invalid_code;
1407 /* We must update these variables now. */
1408 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1409 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1410 continue;
1411
1412 case 'n': /* invocation of locking-shift-2 */
1413 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1414 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1415 goto label_invalid_code;
1416 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1417 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1418 continue;
1419
1420 case 'o': /* invocation of locking-shift-3 */
1421 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1422 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1423 goto label_invalid_code;
1424 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1425 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1426 continue;
1427
1428 case 'N': /* invocation of single-shift-2 */
1429 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1430 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1431 goto label_invalid_code;
1432 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1433 ONE_MORE_BYTE (c1);
1434 break;
1435
1436 case 'O': /* invocation of single-shift-3 */
1437 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1438 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1439 goto label_invalid_code;
1440 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1441 ONE_MORE_BYTE (c1);
1442 break;
1443
1444 case '0': case '2': case '3': case '4': /* start composition */
1445 DECODE_COMPOSITION_START (c1);
1446 continue;
1447
1448 case '1': /* end composition */
1449 DECODE_COMPOSITION_END (c1);
1450 continue;
1451
1452 case '[': /* specification of direction */
1453 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1454 goto label_invalid_code;
1455 /* For the moment, nested direction is not supported.
1456 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1457 left-to-right, and nozero means right-to-left. */
1458 ONE_MORE_BYTE (c1);
1459 switch (c1)
1460 {
1461 case ']': /* end of the current direction */
1462 coding->mode &= ~CODING_MODE_DIRECTION;
1463
1464 case '0': /* end of the current direction */
1465 case '1': /* start of left-to-right direction */
1466 ONE_MORE_BYTE (c1);
1467 if (c1 == ']')
1468 coding->mode &= ~CODING_MODE_DIRECTION;
1469 else
1470 goto label_invalid_code;
1471 break;
1472
1473 case '2': /* start of right-to-left direction */
1474 ONE_MORE_BYTE (c1);
1475 if (c1 == ']')
1476 coding->mode |= CODING_MODE_DIRECTION;
1477 else
1478 goto label_invalid_code;
1479 break;
1480
1481 default:
1482 goto label_invalid_code;
1483 }
1484 continue;
1485
1486 default:
1487 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1488 goto label_invalid_code;
1489 if (c1 >= 0x28 && c1 <= 0x2B)
1490 { /* designation of DIMENSION1_CHARS94 character set */
1491 ONE_MORE_BYTE (c2);
1492 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1493 }
1494 else if (c1 >= 0x2C && c1 <= 0x2F)
1495 { /* designation of DIMENSION1_CHARS96 character set */
1496 ONE_MORE_BYTE (c2);
1497 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1498 }
1499 else
1500 goto label_invalid_code;
1501 /* We must update these variables now. */
1502 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1503 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1504 continue;
1505 }
1506 }
1507
1508 /* Now we know CHARSET and 1st position code C1 of a character.
1509 Produce a multibyte sequence for that character while getting
1510 2nd position code C2 if necessary. */
1511 if (CHARSET_DIMENSION (charset) == 2)
1512 {
1513 ONE_MORE_BYTE (c2);
1514 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1515 /* C2 is not in a valid range. */
1516 goto label_invalid_code;
1517 }
1518 c = DECODE_ISO_CHARACTER (charset, c1, c2);
1519 EMIT_CHAR (c);
1520 continue;
1521
1522 label_invalid_code:
1523 coding->errors++;
1524 if (COMPOSING_P (coding))
1525 DECODE_COMPOSITION_END ('1');
1526 src = src_base;
1527 c = *src++;
1528 EMIT_CHAR (c);
1529 }
1530
1531 label_end_of_loop:
1532 coding->consumed = coding->consumed_char = src_base - source;
1533 coding->produced = dst - destination;
1534 return;
1535 }
1536
1537
1538 /* ISO2022 encoding stuff. */
1539
1540 /*
1541 It is not enough to say just "ISO2022" on encoding, we have to
1542 specify more details. In Emacs, each coding system of ISO2022
1543 variant has the following specifications:
1544 1. Initial designation to G0 thru G3.
1545 2. Allows short-form designation?
1546 3. ASCII should be designated to G0 before control characters?
1547 4. ASCII should be designated to G0 at end of line?
1548 5. 7-bit environment or 8-bit environment?
1549 6. Use locking-shift?
1550 7. Use Single-shift?
1551 And the following two are only for Japanese:
1552 8. Use ASCII in place of JIS0201-1976-Roman?
1553 9. Use JISX0208-1983 in place of JISX0208-1978?
1554 These specifications are encoded in `coding->flags' as flag bits
1555 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1556 details.
1557 */
1558
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.  If <final-char> of CHARSET is
   '@', 'A', or 'B' and the coding system CODING allows, produce
   designation sequence of short-form.  A revision number below 255
   is announced first with ESC & <'@' + revision>.  CODING's
   designation for REG is updated to CHARSET.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
    /* Intermediate characters selecting G0..G3 for 94-character \
       and 96-character sets respectively.  */ \
    char *intermediate_char_94 = "()*+"; \
    char *intermediate_char_96 = ",-./"; \
 \
    if (revision < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + revision; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) != 1) \
      *dst++ = '$'; \
    if (CHARSET_CHARS (charset) != 94) \
      *dst++ = (unsigned char) (intermediate_char_96[reg]); \
    else if (CHARSET_DIMENSION (charset) == 1 \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
             || reg != 0 \
             || final_char < '@' || final_char > 'B') \
      /* Only a 2-byte set going to G0 may drop the intermediate.  */ \
      *dst++ = (unsigned char) (intermediate_char_94[reg]); \
    *dst++ = final_char; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1601
/* The following two macros produce codes for the ISO2022 single-shift
   functions (single-shift-2 and single-shift-3) at DST: the escape
   sequence ESC 'N' / ESC 'O' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the control character ISO_CODE_SS2 /
   ISO_CODE_SS3 otherwise.  Both mark CODING as single-shifting.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1623
/* The following four macros produce codes (control character or
   escape sequence) for the ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3) at DST, and record
   in CODING which graphic register (0..3) is now invoked to graphic
   plane 0.  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1651
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The body is a loop: every branch that emits the character leaves
   it with `break'; only the last step, which designates/invokes
   CHARSET, goes round again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? c1 & 0x7F : c1 | 0x80); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        break; \
      } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* Since CHARSET is not yet invoked to any graphic planes, we \
       must invoke it, or, at first, designate it to some graphic \
       register.  Then repeat the loop to actually produce the \
       character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1694
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The body is a loop: every branch that emits the character leaves
   it with `break'; only the last step, which designates/invokes
   CHARSET, goes round again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = c1 & 0x7F; \
            *dst++ = c2 & 0x7F; \
          } \
        else \
          { \
            *dst++ = c1 | 0x80; \
            *dst++ = c2 | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        *dst++ = c2 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        *dst++ = c2 | 0x80; \
        break; \
      } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* Since CHARSET is not yet invoked to any graphic planes, we \
       must invoke it, or, at first, designate it to some graphic \
       register.  Then repeat the loop to actually produce the \
       character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1737
/* Produce codes for the character of charset CHARSET whose position
   codes are C1 and C2.  For an undefined CHARSET, C1 (and C2 when it
   is not negative) are written to DST unchanged.  Otherwise the
   character is encoded through the DIMENSION1 or DIMENSION2 macro,
   after substituting charset_latin_jisx0201 for ASCII when
   CODING_FLAG_ISO_USE_ROMAN is set, or charset_jisx0208_1978 for
   charset_jisx0208 when CODING_FLAG_ISO_USE_OLDJIS is set.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int alt_charset = charset; \
 \
    if (!CHARSET_DEFINED_P (charset)) \
      { \
        *dst++ = c1; \
        if (c2 >= 0) \
          *dst++ = c2; \
      } \
    else if (CHARSET_DIMENSION (charset) == 1) \
      { \
        if (charset == CHARSET_ASCII \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
          alt_charset = charset_latin_jisx0201; \
        ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1); \
      } \
    else \
      { \
        if (charset == charset_jisx0208 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
          alt_charset = charset_jisx0208_1978; \
        ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2); \
      } \
  } while (0)
1766
1767 /* Produce designation and invocation codes at a place pointed by DST
1768 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1769 Return new DST. */
1770
1771 unsigned char *
1772 encode_invocation_designation (charset, coding, dst)
1773 int charset;
1774 struct coding_system *coding;
1775 unsigned char *dst;
1776 {
1777 int reg; /* graphic register number */
1778
1779 /* At first, check designations. */
1780 for (reg = 0; reg < 4; reg++)
1781 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1782 break;
1783
1784 if (reg >= 4)
1785 {
1786 /* CHARSET is not yet designated to any graphic registers. */
1787 /* At first check the requested designation. */
1788 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1789 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1790 /* Since CHARSET requests no special designation, designate it
1791 to graphic register 0. */
1792 reg = 0;
1793
1794 ENCODE_DESIGNATION (charset, reg, coding);
1795 }
1796
1797 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1798 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1799 {
1800 /* Since the graphic register REG is not invoked to any graphic
1801 planes, invoke it to graphic plane 0. */
1802 switch (reg)
1803 {
1804 case 0: /* graphic register 0 */
1805 ENCODE_SHIFT_IN;
1806 break;
1807
1808 case 1: /* graphic register 1 */
1809 ENCODE_SHIFT_OUT;
1810 break;
1811
1812 case 2: /* graphic register 2 */
1813 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1814 ENCODE_SINGLE_SHIFT_2;
1815 else
1816 ENCODE_LOCKING_SHIFT_2;
1817 break;
1818
1819 case 3: /* graphic register 3 */
1820 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1821 ENCODE_SINGLE_SHIFT_3;
1822 else
1823 ENCODE_LOCKING_SHIFT_3;
1824 break;
1825 }
1826 }
1827
1828 return dst;
1829 }
1830
/* Produce 2-byte codes for encoded composition rule RULE: the two
   values obtained from COMPOSITION_DECODE_RULE, written as
   32 + 81 + GREF followed by 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule) \
  do { \
    int gref, nref; \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    dst[0] = 32 + 81 + gref; \
    dst[1] = 32 + nref; \
    dst += 2; \
  } while (0)
1840
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition.  See the comment
   in coding.h for the format of DATA.  The method is taken from
   DATA[3]; for any method other than COMPOSITION_RELATIVE,
   coding->cmp_data_index is set just past the four-slot header.  */

#define ENCODE_COMPOSITION_START(coding, data) \
  do { \
    coding->composing = data[3]; \
    *dst++ = ISO_CODE_ESC; \
    if (coding->composing != COMPOSITION_RELATIVE) \
      { \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS) \
          *dst++ = '3'; \
        else \
          *dst++ = '4'; \
        coding->cmp_data_index = coding->cmp_data_start + 4; \
        coding->composition_rule_follows = 0; \
      } \
    else \
      *dst++ = '0'; \
  } while (0)
1860
/* Produce codes (ESC 1) for indicating the end of the current
   composition, and step coding->cmp_data_start over the record DATA
   (whose length is DATA[0]).  When that exhausts the current block
   and a following block exists, move on to the start of that block.  */

#define ENCODE_COMPOSITION_END(coding, data) \
  do { \
    dst[0] = ISO_CODE_ESC; \
    dst[1] = '1'; \
    dst += 2; \
    coding->composing = COMPOSITION_NO; \
    coding->cmp_data_start += data[0]; \
    if (coding->cmp_data->next \
        && coding->cmp_data_start == coding->cmp_data->used) \
      { \
        coding->cmp_data = coding->cmp_data->next; \
        coding->cmp_data_start = 0; \
      } \
  } while (0)
1876
/* Produce composition start sequence ESC 0.  Here, this sequence
   doesn't mean the start of a new composition but means that we have
   just produced components (alternate chars and composition rules) of
   the composition and the actual text follows in SRC.  CODING's
   composing method becomes COMPOSITION_RELATIVE.  */

#define ENCODE_COMPOSITION_FAKE_START(coding) \
  do { \
    coding->composing = COMPOSITION_RELATIVE; \
    dst[0] = ISO_CODE_ESC; \
    dst[1] = '0'; \
    dst += 2; \
  } while (0)
1888
/* The following three macros produce codes for indicating direction
   of text.  They use the variables `coding' and `dst' of the
   expansion site.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an expression (not a
   statement) so that the direction macros can chain it with the comma
   operator.  It emits ESC [ when the coding system is restricted to
   seven bits and the single byte CSI otherwise.  CODING->flags is a
   bit mask, so the seven-bit flag is tested with `&'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)		\
   ? (*dst++ = ISO_CODE_ESC, *dst++ = '[')		\
   : (*dst++ = ISO_CODE_CSI))

/* Emit CSI 2 ], which switches the text direction to right-to-left.  */
#define ENCODE_DIRECTION_R2L	\
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']')

/* Emit CSI 0 ], which switches the text direction back to
   left-to-right.  */
#define ENCODE_DIRECTION_L2R	\
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']')
1904
/* Bring the graphic planes and registers back to their initial
   state: emit a shift-in when the invocation of plane 0 is not
   register 0, then re-designate every graphic register whose current
   designation differs from its (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					    \
  do {									    \
    int reg;								    \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			    \
      ENCODE_SHIFT_IN;							    \
    for (reg = 0; reg < 4; reg++)					    \
      {									    \
	if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0	    \
	    || (CODING_SPEC_ISO_DESIGNATION (coding, reg)		    \
		== CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))	    \
	  continue;							    \
	ENCODE_DESIGNATION						    \
	  (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }									    \
  } while (0)
1919
1920 /* Produce designation sequences of charsets in the line started from
1921 SRC to a place pointed by DST, and return updated DST.
1922
1923 If the current block ends before any end-of-line, we may fail to
1924 find all the necessary designations. */
1925
1926 static unsigned char *
1927 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1928 struct coding_system *coding;
1929 Lisp_Object translation_table;
1930 unsigned char *src, *src_end, *dst;
1931 {
1932 int charset, c, found = 0, reg;
1933 /* Table of charsets to be designated to each graphic register. */
1934 int r[4];
1935
1936 for (reg = 0; reg < 4; reg++)
1937 r[reg] = -1;
1938
1939 while (found < 4)
1940 {
1941 ONE_MORE_CHAR (c);
1942 if (c == '\n')
1943 break;
1944
1945 charset = CHAR_CHARSET (c);
1946 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1947 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1948 {
1949 found++;
1950 r[reg] = charset;
1951 }
1952 }
1953
1954 label_end_of_loop:
1955 if (found)
1956 {
1957 for (reg = 0; reg < 4; reg++)
1958 if (r[reg] >= 0
1959 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1960 ENCODE_DESIGNATION (r[reg], reg, coding);
1961 }
1962
1963 return dst;
1964 }
1965
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the SRC_BYTES bytes at SOURCE into DESTINATION (DST_BYTES
   bytes long) according to the ISO 2022 settings in CODING, emitting
   composition escape sequences, designations and end-of-line codes
   on the way.  On exit, CODING->consumed, CODING->produced and
   CODING->produced_char tell how far the conversion got, and
   CODING->consumed_char and CODING->errors have been counted from
   zero.  */

static void
encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
{
  unsigned char *src = source;
  unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Since the maximum bytes produced by each loop is 20, we subtract 19
     from DST_END to assure overflow checking is necessary only at the
     head of loop.  */
  unsigned char *adjusted_dst_end = dst_end - 19;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source text to
     analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
     there's not enough destination area to produce encoded codes
     (within macro EMIT_BYTES).  */
  unsigned char *src_base;
  int c;
  Lisp_Object translation_table;

  /* Choose the translation table: none when character translation is
     disabled, otherwise the coding system's own table for encoding,
     falling back to the standard one.  */
  if (NILP (Venable_character_translation))
    translation_table = Qnil;
  else
    {
      translation_table = coding->translation_table_for_encode;
      if (NILP (translation_table))
	translation_table = Vstandard_translation_table_for_encode;
    }

  coding->consumed_char = 0;
  coding->errors = 0;
  while (1)
    {
      int charset, c1, c2;

      src_base = src;

      /* Stop before the output area can overflow.  When DST_BYTES is
	 zero the limit is taken relative to SRC instead of DST_END
	 (NOTE(review): presumably source and destination then share
	 one buffer -- confirm against the callers).  */
      if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
	{
	  coding->result = CODING_FINISH_INSUFFICIENT_DST;
	  break;
	}

      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
	  && CODING_SPEC_ISO_BOL (coding))
	{
	  /* We have to produce designation sequences if any now.  */
	  dst = encode_designation_at_bol (coding, translation_table,
					   src, src_end, dst);
	  CODING_SPEC_ISO_BOL (coding) = 0;
	}

      /* Check composition start and end.  */
      if (coding->composing != COMPOSITION_DISABLED
	  && coding->cmp_data_start < coding->cmp_data->used)
	{
	  struct composition_data *cmp_data = coding->cmp_data;
	  /* DATA is the record of the composition being handled;
	     DATA[1] and DATA[2] are compared below against THIS_POS to
	     find where the composition starts and ends.  */
	  int *data = cmp_data->data + coding->cmp_data_start;
	  int this_pos = cmp_data->char_offset + coding->consumed_char;

	  if (coding->composing == COMPOSITION_RELATIVE)
	    {
	      if (this_pos == data[2])
		{
		  ENCODE_COMPOSITION_END (coding, data);
		  /* The macro may have advanced to the next
		     composition record or data block, so reload.  */
		  cmp_data = coding->cmp_data;
		  data = cmp_data->data + coding->cmp_data_start;
		}
	    }
	  else if (COMPOSING_P (coding))
	    {
	      /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
	      if (coding->cmp_data_index == coding->cmp_data_start + data[0])
		/* We have consumed components of the composition.
                   What follows in SRC is the composition's base
                   text.  */
		ENCODE_COMPOSITION_FAKE_START (coding);
	      else
		{
		  /* Emit the next component.  This C shadows the
		     function-level C.  */
		  int c = cmp_data->data[coding->cmp_data_index++];
		  if (coding->composition_rule_follows)
		    {
		      ENCODE_COMPOSITION_RULE (c);
		      coding->composition_rule_follows = 0;
		    }
		  else
		    {
		      SPLIT_CHAR (c, charset, c1, c2);
		      ENCODE_ISO_CHARACTER (charset, c1, c2);
		      /* With rules, components alternate between a
			 character and a rule.  */
		      if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
			coding->composition_rule_follows = 1;
		    }
		  /* No source character was consumed in this round.  */
		  continue;
		}
	    }
	  if (!COMPOSING_P (coding))
	    {
	      if (this_pos == data[1])
		{
		  ENCODE_COMPOSITION_START (coding, data);
		  continue;
		}
	    }
	}

      ONE_MORE_CHAR (c);

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
	{
	  if (c == '\r')
	    {
	      if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
		{
		  if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
		    ENCODE_RESET_PLANE_AND_REGISTER;
		  *dst++ = c;
		  /* Note that this path skips the consumed_char
		     increment at the bottom of the loop.  */
		  continue;
		}
	      /* fall down to treat '\r' as '\n' ...  */
	      c = '\n';
	    }
	  if (c == '\n')
	    {
	      if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
		ENCODE_RESET_PLANE_AND_REGISTER;
	      /* Forget the current designations without emitting
		 anything; only the bookkeeping is reset.  */
	      if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
		bcopy (coding->spec.iso2022.initial_designation,
		       coding->spec.iso2022.current_designation,
		       sizeof coding->spec.iso2022.initial_designation);
	      /* Write the end-of-line in the requested format.  */
	      if (coding->eol_type == CODING_EOL_LF
		  || coding->eol_type == CODING_EOL_UNDECIDED)
		*dst++ = ISO_CODE_LF;
	      else if (coding->eol_type == CODING_EOL_CRLF)
		*dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
	      else
		*dst++ = ISO_CODE_CR;
	      CODING_SPEC_ISO_BOL (coding) = 1;
	    }
	  else
	    {
	      if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
		ENCODE_RESET_PLANE_AND_REGISTER;
	      *dst++ = c;
	    }
	}
      else if (ASCII_BYTE_P (c))
	ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
      else if (SINGLE_BYTE_CHAR_P (c))
	{
	  /* A single-byte character that is not ASCII: pass it
	     through unchanged and count it as an error.  */
	  *dst++ = c;
	  coding->errors++;
	}
      else
	{
	  SPLIT_CHAR (c, charset, c1, c2);
	  ENCODE_ISO_CHARACTER (charset, c1, c2);
	}

      coding->consumed_char++;
    }

 label_end_of_loop:
  /* Report progress up to SRC_BASE, the start of the last loop
     round, rather than up to SRC.  */
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
}
2137
2138 \f
2139 /*** 4. SJIS and BIG5 handlers ***/
2140
2141 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2142 quite widely. So, for the moment, Emacs supports them in the bare
2143 C code. But, in the future, they may be supported only by CCL. */
2144
2145 /* SJIS is a coding system encoding three character sets: ASCII, right
2146 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2147 as is. A character of charset katakana-jisx0201 is encoded by
2148 "position-code + 0x80". A character of charset japanese-jisx0208
   is encoded in 2 bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
2151
2152 --- CODE RANGE of SJIS ---
2153 (character set) (range)
2154 ASCII 0x00 .. 0x7F
2155 KATAKANA-JISX0201 0xA0 .. 0xDF
2156 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2157 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2158 -------------------------------
2159
2160 */
2161
2162 /* BIG5 is a coding system encoding two character sets: ASCII and
2163 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2164 character set and is encoded in two-byte.
2165
2166 --- CODE RANGE of BIG5 ---
2167 (character set) (range)
2168 ASCII 0x00 .. 0x7F
2169 Big5 (1st byte) 0xA1 .. 0xFE
2170 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2171 --------------------------
2172
2173 Since the number of characters in Big5 is larger than maximum
2174 characters in Emacs' charset (96x96), it can't be handled as one
2175 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2176 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2177 contains frequently used characters and the latter contains less
2178 frequently used characters. */
2179
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized, and the input bytes of
   DECODE_BIG5 are evaluated exactly once, so arbitrary expressions
   may be passed.  The outputs may name the same variables as the
   inputs (e.g. DECODE_BIG5 (c1, c2, charset, c1, c2)); all inputs
   are read before any output is stored.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    int big5_b1_ = (b1), big5_b2_ = (b2);				\
    /* Linear index of the character within the whole of Big5.  The	\
       2nd byte ranges 0x40..0x7E and 0xA1..0xFE are made contiguous.  */ \
    unsigned int big5_temp_						\
      = ((big5_b1_ - 0xA1) * BIG5_SAME_ROW				\
	 + big5_b2_ - (big5_b2_ < 0x7F ? 0x40 : 0x62));			\
    if (big5_b1_ < 0xC9)						\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	big5_temp_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;			\
      }									\
    (c1) = big5_temp_ / (0xFF - 0xA1) + 0x21;				\
    (c2) = big5_temp_ % (0xFF - 0xA1) + 0x21;				\
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int big5_temp_						\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    if ((charset) == charset_big5_2)					\
      big5_temp_ += BIG5_SAME_ROW * (0xC9 - 0xA1);			\
    (b1) = big5_temp_ / BIG5_SAME_ROW + 0xA1;				\
    /* Offset within the row; split it back into the two 2nd byte	\
       ranges.  */							\
    big5_temp_ %= BIG5_SAME_ROW;					\
    (b2) = big5_temp_ + (big5_temp_ < 0x3F ? 0x40 : 0x62);		\
  } while (0)
2212
2213 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2214 Check if a text is encoded in SJIS. If it is, return
2215 CODING_CATEGORY_MASK_SJIS, else return 0. */
2216
2217 int
2218 detect_coding_sjis (src, src_end)
2219 unsigned char *src, *src_end;
2220 {
2221 int c;
2222 /* Dummy for ONE_MORE_BYTE. */
2223 struct coding_system dummy_coding;
2224 struct coding_system *coding = &dummy_coding;
2225
2226 while (1)
2227 {
2228 ONE_MORE_BYTE (c);
2229 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2230 {
2231 ONE_MORE_BYTE (c);
2232 if (c < 0x40)
2233 return 0;
2234 }
2235 }
2236 label_end_of_loop:
2237 return CODING_CATEGORY_MASK_SJIS;
2238 }
2239
2240 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2241 Check if a text is encoded in BIG5. If it is, return
2242 CODING_CATEGORY_MASK_BIG5, else return 0. */
2243
2244 int
2245 detect_coding_big5 (src, src_end)
2246 unsigned char *src, *src_end;
2247 {
2248 int c;
2249 /* Dummy for ONE_MORE_BYTE. */
2250 struct coding_system dummy_coding;
2251 struct coding_system *coding = &dummy_coding;
2252
2253 while (1)
2254 {
2255 ONE_MORE_BYTE (c);
2256 if (c >= 0xA1)
2257 {
2258 ONE_MORE_BYTE (c);
2259 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2260 return 0;
2261 }
2262 }
2263 label_end_of_loop:
2264 return CODING_CATEGORY_MASK_BIG5;
2265 }
2266
2267 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2268 Check if a text is encoded in UTF-8. If it is, return
2269 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2270
2271 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2272 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2273 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2274 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2275 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2276 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2277 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2278
2279 int
2280 detect_coding_utf_8 (src, src_end)
2281 unsigned char *src, *src_end;
2282 {
2283 unsigned char c;
2284 int seq_maybe_bytes;
2285 /* Dummy for ONE_MORE_BYTE. */
2286 struct coding_system dummy_coding;
2287 struct coding_system *coding = &dummy_coding;
2288
2289 while (1)
2290 {
2291 ONE_MORE_BYTE (c);
2292 if (UTF_8_1_OCTET_P (c))
2293 continue;
2294 else if (UTF_8_2_OCTET_LEADING_P (c))
2295 seq_maybe_bytes = 1;
2296 else if (UTF_8_3_OCTET_LEADING_P (c))
2297 seq_maybe_bytes = 2;
2298 else if (UTF_8_4_OCTET_LEADING_P (c))
2299 seq_maybe_bytes = 3;
2300 else if (UTF_8_5_OCTET_LEADING_P (c))
2301 seq_maybe_bytes = 4;
2302 else if (UTF_8_6_OCTET_LEADING_P (c))
2303 seq_maybe_bytes = 5;
2304 else
2305 return 0;
2306
2307 do
2308 {
2309 ONE_MORE_BYTE (c);
2310 if (!UTF_8_EXTRA_OCTET_P (c))
2311 return 0;
2312 seq_maybe_bytes--;
2313 }
2314 while (seq_maybe_bytes > 0);
2315 }
2316
2317 label_end_of_loop:
2318 return CODING_CATEGORY_MASK_UTF_8;
2319 }
2320
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking for a byte order
   mark in its first two bytes.  If they are FE FF, return
   CODING_CATEGORY_MASK_UTF_16_BE (big endian); if they are FF FE,
   return CODING_CATEGORY_MASK_UTF_16_LE (little endian); else
   return 0.  */
2326
/* Nonzero if the 16-bit code unit VAL is one of the two values that
   are never valid characters (0xFFFE and 0xFFFF).  */
#define UTF_16_INVALID_P(val)	\
  (((val) == 0xFFFE)		\
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. lies in 0xD800..0xDBFF.  The top six bits select the range,
   hence the mask 0xFC00.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2336
2337 int
2338 detect_coding_utf_16 (src, src_end)
2339 unsigned char *src, *src_end;
2340 {
2341 unsigned char c1, c2;
2342 /* Dummy for TWO_MORE_BYTES. */
2343 struct coding_system dummy_coding;
2344 struct coding_system *coding = &dummy_coding;
2345
2346 TWO_MORE_BYTES (c1, c2);
2347
2348 if ((c1 == 0xFF) && (c2 == 0xFE))
2349 return CODING_CATEGORY_MASK_UTF_16_LE;
2350 else if ((c1 == 0xFE) && (c2 == 0xFF))
2351 return CODING_CATEGORY_MASK_UTF_16_BE;
2352
2353 label_end_of_loop:
2354 return 0;
2355 }
2356
2357 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2358 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2359
2360 static void
2361 decode_coding_sjis_big5 (coding, source, destination,
2362 src_bytes, dst_bytes, sjis_p)
2363 struct coding_system *coding;
2364 unsigned char *source, *destination;
2365 int src_bytes, dst_bytes;
2366 int sjis_p;
2367 {
2368 unsigned char *src = source;
2369 unsigned char *src_end = source + src_bytes;
2370 unsigned char *dst = destination;
2371 unsigned char *dst_end = destination + dst_bytes;
2372 /* SRC_BASE remembers the start position in source in each loop.
2373 The loop will be exited when there's not enough source code
2374 (within macro ONE_MORE_BYTE), or when there's not enough
2375 destination area to produce a character (within macro
2376 EMIT_CHAR). */
2377 unsigned char *src_base;
2378 Lisp_Object translation_table;
2379
2380 if (NILP (Venable_character_translation))
2381 translation_table = Qnil;
2382 else
2383 {
2384 translation_table = coding->translation_table_for_decode;
2385 if (NILP (translation_table))
2386 translation_table = Vstandard_translation_table_for_decode;
2387 }
2388
2389 coding->produced_char = 0;
2390 while (1)
2391 {
2392 int c, charset, c1, c2;
2393
2394 src_base = src;
2395 ONE_MORE_BYTE (c1);
2396
2397 if (c1 < 0x80)
2398 {
2399 charset = CHARSET_ASCII;
2400 if (c1 < 0x20)
2401 {
2402 if (c1 == '\r')
2403 {
2404 if (coding->eol_type == CODING_EOL_CRLF)
2405 {
2406 ONE_MORE_BYTE (c2);
2407 if (c2 == '\n')
2408 c1 = c2;
2409 else if (coding->mode
2410 & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2411 {
2412 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2413 goto label_end_of_loop;
2414 }
2415 else
2416 /* To process C2 again, SRC is subtracted by 1. */
2417 src--;
2418 }
2419 else if (coding->eol_type == CODING_EOL_CR)
2420 c1 = '\n';
2421 }
2422 else if (c1 == '\n'
2423 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2424 && (coding->eol_type == CODING_EOL_CR
2425 || coding->eol_type == CODING_EOL_CRLF))
2426 {
2427 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2428 goto label_end_of_loop;
2429 }
2430 }
2431 }
2432 else
2433 {
2434 if (sjis_p)
2435 {
2436 if (c1 >= 0xF0)
2437 goto label_invalid_code;
2438 if (c1 < 0xA0 || c1 >= 0xE0)
2439 {
2440 /* SJIS -> JISX0208 */
2441 ONE_MORE_BYTE (c2);
2442 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2443 goto label_invalid_code;
2444 DECODE_SJIS (c1, c2, c1, c2);
2445 charset = charset_jisx0208;
2446 }
2447 else
2448 /* SJIS -> JISX0201-Kana */
2449 charset = charset_katakana_jisx0201;
2450 }
2451 else
2452 {
2453 /* BIG5 -> Big5 */
2454 if (c1 < 0xA1 || c1 > 0xFE)
2455 goto label_invalid_code;
2456 ONE_MORE_BYTE (c2);
2457 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2458 goto label_invalid_code;
2459 DECODE_BIG5 (c1, c2, charset, c1, c2);
2460 }
2461 }
2462
2463 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2464 EMIT_CHAR (c);
2465 continue;
2466
2467 label_invalid_code:
2468 coding->errors++;
2469 src = src_base;
2470 c = *src++;
2471 EMIT_CHAR (c);
2472 }
2473
2474 label_end_of_loop:
2475 coding->consumed = coding->consumed_char = src_base - source;
2476 coding->produced = dst - destination;
2477 return;
2478 }
2479
2480 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2481 This function can encode charsets `ascii', `katakana-jisx0201',
2482 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2483 are sure that all these charsets are registered as official charset
2484 (i.e. do not have extended leading-codes). Characters of other
2485 charsets are produced without any encoding. If SJIS_P is 1, encode
2486 SJIS text, else encode BIG5 text. */
2487
2488 static void
2489 encode_coding_sjis_big5 (coding, source, destination,
2490 src_bytes, dst_bytes, sjis_p)
2491 struct coding_system *coding;
2492 unsigned char *source, *destination;
2493 int src_bytes, dst_bytes;
2494 int sjis_p;
2495 {
2496 unsigned char *src = source;
2497 unsigned char *src_end = source + src_bytes;
2498 unsigned char *dst = destination;
2499 unsigned char *dst_end = destination + dst_bytes;
2500 /* SRC_BASE remembers the start position in source in each loop.
2501 The loop will be exited when there's not enough source text to
2502 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2503 there's not enough destination area to produce encoded codes
2504 (within macro EMIT_BYTES). */
2505 unsigned char *src_base;
2506 Lisp_Object translation_table;
2507
2508 if (NILP (Venable_character_translation))
2509 translation_table = Qnil;
2510 else
2511 {
2512 translation_table = coding->translation_table_for_decode;
2513 if (NILP (translation_table))
2514 translation_table = Vstandard_translation_table_for_decode;
2515 }
2516
2517 while (1)
2518 {
2519 int c, charset, c1, c2;
2520
2521 src_base = src;
2522 ONE_MORE_CHAR (c);
2523
2524 /* Now encode the character C. */
2525 if (SINGLE_BYTE_CHAR_P (c))
2526 {
2527 switch (c)
2528 {
2529 case '\r':
2530 if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2531 {
2532 EMIT_ONE_BYTE (c);
2533 break;
2534 }
2535 c = '\n';
2536 case '\n':
2537 if (coding->eol_type == CODING_EOL_CRLF)
2538 {
2539 EMIT_TWO_BYTES ('\r', c);
2540 break;
2541 }
2542 else if (coding->eol_type == CODING_EOL_CR)
2543 c = '\r';
2544 default:
2545 EMIT_ONE_BYTE (c);
2546 }
2547 }
2548 else
2549 {
2550 SPLIT_CHAR (c, charset, c1, c2);
2551 if (sjis_p)
2552 {
2553 if (charset == charset_jisx0208
2554 || charset == charset_jisx0208_1978)
2555 {
2556 ENCODE_SJIS (c1, c2, c1, c2);
2557 EMIT_TWO_BYTES (c1, c2);
2558 }
2559 else if (charset == charset_latin_jisx0201)
2560 EMIT_ONE_BYTE (c1);
2561 else
2562 /* There's no way other than producing the internal
2563 codes as is. */
2564 EMIT_BYTES (src_base, src);
2565 }
2566 else
2567 {
2568 if (charset == charset_big5_1 || charset == charset_big5_2)
2569 {
2570 ENCODE_BIG5 (charset, c1, c2, c1, c2);
2571 EMIT_TWO_BYTES (c1, c2);
2572 }
2573 else
2574 /* There's no way other than producing the internal
2575 codes as is. */
2576 EMIT_BYTES (src_base, src);
2577 }
2578 }
2579 coding->consumed_char++;
2580 }
2581
2582 label_end_of_loop:
2583 coding->consumed = src_base - source;
2584 coding->produced = coding->produced_char = dst - destination;
2585 }
2586
2587 \f
2588 /*** 5. CCL handlers ***/
2589
2590 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2591 Check if a text is encoded in a coding system of which
2592 encoder/decoder are written in CCL program. If it is, return
2593 CODING_CATEGORY_MASK_CCL, else return 0. */
2594
2595 int
2596 detect_coding_ccl (src, src_end)
2597 unsigned char *src, *src_end;
2598 {
2599 unsigned char *valid;
2600 int c;
2601 /* Dummy for ONE_MORE_BYTE. */
2602 struct coding_system dummy_coding;
2603 struct coding_system *coding = &dummy_coding;
2604
2605 /* No coding system is assigned to coding-category-ccl. */
2606 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2607 return 0;
2608
2609 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2610 while (1)
2611 {
2612 ONE_MORE_BYTE (c);
2613 if (! valid[c])
2614 return 0;
2615 }
2616 label_end_of_loop:
2617 return CODING_CATEGORY_MASK_CCL;
2618 }
2619
2620 \f
2621 /*** 6. End-of-line handlers ***/
2622
2623 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2624
2625 static void
2626 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2627 struct coding_system *coding;
2628 unsigned char *source, *destination;
2629 int src_bytes, dst_bytes;
2630 {
2631 unsigned char *src = source;
2632 unsigned char *dst = destination;
2633 unsigned char *src_end = src + src_bytes;
2634 unsigned char *dst_end = dst + dst_bytes;
2635 Lisp_Object translation_table;
2636 /* SRC_BASE remembers the start position in source in each loop.
2637 The loop will be exited when there's not enough source code
2638 (within macro ONE_MORE_BYTE), or when there's not enough
2639 destination area to produce a character (within macro
2640 EMIT_CHAR). */
2641 unsigned char *src_base;
2642 int c;
2643
2644 translation_table = Qnil;
2645 switch (coding->eol_type)
2646 {
2647 case CODING_EOL_CRLF:
2648 while (1)
2649 {
2650 src_base = src;
2651 ONE_MORE_BYTE (c);
2652 if (c == '\r')
2653 {
2654 ONE_MORE_BYTE (c);
2655 if (c != '\n')
2656 {
2657 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2658 {
2659 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2660 goto label_end_of_loop;
2661 }
2662 src--;
2663 c = '\r';
2664 }
2665 }
2666 else if (c == '\n'
2667 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2668 {
2669 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2670 goto label_end_of_loop;
2671 }
2672 EMIT_CHAR (c);
2673 }
2674 break;
2675
2676 case CODING_EOL_CR:
2677 while (1)
2678 {
2679 src_base = src;
2680 ONE_MORE_BYTE (c);
2681 if (c == '\n')
2682 {
2683 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2684 {
2685 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2686 goto label_end_of_loop;
2687 }
2688 }
2689 else if (c == '\r')
2690 c = '\n';
2691 EMIT_CHAR (c);
2692 }
2693 break;
2694
2695 default: /* no need for EOL handling */
2696 while (1)
2697 {
2698 src_base = src;
2699 ONE_MORE_BYTE (c);
2700 EMIT_CHAR (c);
2701 }
2702 }
2703
2704 label_end_of_loop:
2705 coding->consumed = coding->consumed_char = src_base - source;
2706 coding->produced = dst - destination;
2707 return;
2708 }
2709
2710 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2711 format of end-of-line according to `coding->eol_type'. It also
2712 convert multibyte form 8-bit characers to unibyte if
2713 CODING->src_multibyte is nonzero. If `coding->mode &
2714 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2715 also means end-of-line. */
2716
2717 static void
2718 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2719 struct coding_system *coding;
2720 unsigned char *source, *destination;
2721 int src_bytes, dst_bytes;
2722 {
2723 unsigned char *src = source;
2724 unsigned char *dst = destination;
2725 unsigned char *src_end = src + src_bytes;
2726 unsigned char *dst_end = dst + dst_bytes;
2727 Lisp_Object translation_table;
2728 /* SRC_BASE remembers the start position in source in each loop.
2729 The loop will be exited when there's not enough source text to
2730 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2731 there's not enough destination area to produce encoded codes
2732 (within macro EMIT_BYTES). */
2733 unsigned char *src_base;
2734 int c;
2735 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2736
2737 translation_table = Qnil;
2738 if (coding->src_multibyte
2739 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2740 {
2741 src_end--;
2742 src_bytes--;
2743 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2744 }
2745
2746 if (coding->eol_type == CODING_EOL_CRLF)
2747 {
2748 while (src < src_end)
2749 {
2750 src_base = src;
2751 c = *src++;
2752 if (c >= 0x20)
2753 EMIT_ONE_BYTE (c);
2754 else if (c == '\n' || (c == '\r' && selective_display))
2755 EMIT_TWO_BYTES ('\r', '\n');
2756 else
2757 EMIT_ONE_BYTE (c);
2758 }
2759 src_base = src;
2760 label_end_of_loop:
2761 ;
2762 }
2763 else
2764 {
2765 if (src_bytes <= dst_bytes)
2766 {
2767 safe_bcopy (src, dst, src_bytes);
2768 src_base = src_end;
2769 dst += src_bytes;
2770 }
2771 else
2772 {
2773 if (coding->src_multibyte
2774 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2775 dst_bytes--;
2776 safe_bcopy (src, dst, dst_bytes);
2777 src_base = src + dst_bytes;
2778 dst = destination + dst_bytes;
2779 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2780 }
2781 if (coding->eol_type == CODING_EOL_CR)
2782 {
2783 for (src = destination; src < dst; src++)
2784 if (*src == '\n') *src = '\r';
2785 }
2786 else if (selective_display)
2787 {
2788 for (src = destination; src < dst; src++)
2789 if (*src == '\r') *src = '\n';
2790 }
2791 }
2792 if (coding->src_multibyte)
2793 dst = destination + str_as_unibyte (destination, dst - destination);
2794
2795 coding->consumed = src_base - source;
2796 coding->produced = dst - destination;
2797 }
2798
2799 \f
2800 /*** 7. C library functions ***/
2801
2802 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2803 has a property `coding-system'. The value of this property is a
2804 vector of length 5 (called as coding-vector). Among elements of
2805 this vector, the first (element[0]) and the fifth (element[4])
2806 carry important information for decoding/encoding. Before
2807 decoding/encoding, this information should be set in fields of a
2808 structure of type `coding_system'.
2809
2810 A value of property `coding-system' can be a symbol of another
2811 subsidiary coding-system. In that case, Emacs gets coding-vector
2812 from that symbol.
2813
2814 `element[0]' contains information to be set in `coding->type'. The
2815 value and its meaning is as follows:
2816
2817 0 -- coding_type_emacs_mule
2818 1 -- coding_type_sjis
2819 2 -- coding_type_iso2022
2820 3 -- coding_type_big5
2821 4 -- coding_type_ccl encoder/decoder written in CCL
2822 nil -- coding_type_no_conversion
2823 t -- coding_type_undecided (automatic conversion on decoding,
2824 no-conversion on encoding)
2825
2826 `element[4]' contains information to be set in `coding->flags' and
2827 `coding->spec'. The meaning varies by `coding->type'.
2828
2829 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2830 of length 32 (of which the first 13 sub-elements are used now).
2831 Meanings of these sub-elements are:
2832
2833 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2834 If the value is an integer of valid charset, the charset is
2835 assumed to be designated to graphic register N initially.
2836
2837 If the value is minus, it is a minus value of charset which
2838 reserves graphic register N, which means that the charset is
2839 not designated initially but should be designated to graphic
2840 register N just before encoding a character in that charset.
2841
2842 If the value is nil, graphic register N is never used on
2843 encoding.
2844
2845 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2846 Each value takes t or nil. See the section ISO2022 of
2847 `coding.h' for more information.
2848
2849 If `coding->type' is `coding_type_big5', element[4] is t to denote
2850 BIG5-ETen or nil to denote BIG5-HKU.
2851
2852 If `coding->type' takes the other value, element[4] is ignored.
2853
2854 Emacs Lisp's coding system also carries information about format of
2855 end-of-line in a value of property `eol-type'. If the value is
2856 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2857 means CODING_EOL_CR. If it is not integer, it should be a vector
2858 of subsidiary coding systems of which property `eol-type' has one
2859 of above values.
2860
2861 */
2862
2863 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2864 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2865 is setup so that no conversion is necessary and return -1, else
2866 return 0. */
2867
2868 int
2869 setup_coding_system (coding_system, coding)
2870 Lisp_Object coding_system;
2871 struct coding_system *coding;
2872 {
2873 Lisp_Object coding_spec, coding_type, eol_type, plist;
2874 Lisp_Object val;
2875 int i;
2876
2877 /* Initialize some fields required for all kinds of coding systems. */
2878 coding->symbol = coding_system;
2879 coding->common_flags = 0;
2880 coding->mode = 0;
2881 coding->heading_ascii = -1;
2882 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2883 coding->composing = COMPOSITION_DISABLED;
2884 coding->cmp_data = NULL;
2885
2886 if (NILP (coding_system))
2887 goto label_invalid_coding_system;
2888
2889 coding_spec = Fget (coding_system, Qcoding_system);
2890
2891 if (!VECTORP (coding_spec)
2892 || XVECTOR (coding_spec)->size != 5
2893 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2894 goto label_invalid_coding_system;
2895
2896 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2897 if (VECTORP (eol_type))
2898 {
2899 coding->eol_type = CODING_EOL_UNDECIDED;
2900 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2901 }
2902 else if (XFASTINT (eol_type) == 1)
2903 {
2904 coding->eol_type = CODING_EOL_CRLF;
2905 coding->common_flags
2906 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2907 }
2908 else if (XFASTINT (eol_type) == 2)
2909 {
2910 coding->eol_type = CODING_EOL_CR;
2911 coding->common_flags
2912 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2913 }
2914 else
2915 coding->eol_type = CODING_EOL_LF;
2916
2917 coding_type = XVECTOR (coding_spec)->contents[0];
2918 /* Try short cut. */
2919 if (SYMBOLP (coding_type))
2920 {
2921 if (EQ (coding_type, Qt))
2922 {
2923 coding->type = coding_type_undecided;
2924 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2925 }
2926 else
2927 coding->type = coding_type_no_conversion;
2928 return 0;
2929 }
2930
2931 /* Get values of coding system properties:
2932 `post-read-conversion', `pre-write-conversion',
2933 `translation-table-for-decode', `translation-table-for-encode'. */
2934 plist = XVECTOR (coding_spec)->contents[3];
2935 /* Pre & post conversion functions should be disabled if
2936 inhibit_pre_post_conversion is nonzero. This is the case when a code
2937 conversion function is called while those functions are running. */
2938 if (! inhibit_pre_post_conversion)
2939 {
2940 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2941 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2942 }
2943 val = Fplist_get (plist, Qtranslation_table_for_decode);
2944 if (SYMBOLP (val))
2945 val = Fget (val, Qtranslation_table_for_decode);
2946 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2947 val = Fplist_get (plist, Qtranslation_table_for_encode);
2948 if (SYMBOLP (val))
2949 val = Fget (val, Qtranslation_table_for_encode);
2950 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2951 val = Fplist_get (plist, Qcoding_category);
2952 if (!NILP (val))
2953 {
2954 val = Fget (val, Qcoding_category_index);
2955 if (INTEGERP (val))
2956 coding->category_idx = XINT (val);
2957 else
2958 goto label_invalid_coding_system;
2959 }
2960 else
2961 goto label_invalid_coding_system;
2962
2963 val = Fplist_get (plist, Qsafe_charsets);
2964 if (EQ (val, Qt))
2965 {
2966 for (i = 0; i <= MAX_CHARSET; i++)
2967 coding->safe_charsets[i] = 1;
2968 }
2969 else
2970 {
2971 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2972 while (CONSP (val))
2973 {
2974 if ((i = get_charset_id (XCAR (val))) >= 0)
2975 coding->safe_charsets[i] = 1;
2976 val = XCDR (val);
2977 }
2978 }
2979
2980 /* If the coding system has non-nil `composition' property, enable
2981 composition handling. */
2982 val = Fplist_get (plist, Qcomposition);
2983 if (!NILP (val))
2984 coding->composing = COMPOSITION_NO;
2985
2986 switch (XFASTINT (coding_type))
2987 {
2988 case 0:
2989 coding->type = coding_type_emacs_mule;
2990 if (!NILP (coding->post_read_conversion))
2991 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2992 if (!NILP (coding->pre_write_conversion))
2993 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2994 break;
2995
2996 case 1:
2997 coding->type = coding_type_sjis;
2998 coding->common_flags
2999 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3000 break;
3001
3002 case 2:
3003 coding->type = coding_type_iso2022;
3004 coding->common_flags
3005 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3006 {
3007 Lisp_Object val, temp;
3008 Lisp_Object *flags;
3009 int i, charset, reg_bits = 0;
3010
3011 val = XVECTOR (coding_spec)->contents[4];
3012
3013 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3014 goto label_invalid_coding_system;
3015
3016 flags = XVECTOR (val)->contents;
3017 coding->flags
3018 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3019 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3020 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3021 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3022 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3023 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3024 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3025 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3026 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3027 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3028 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3029 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3030 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3031 );
3032
3033 /* Invoke graphic register 0 to plane 0. */
3034 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3035 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3036 CODING_SPEC_ISO_INVOCATION (coding, 1)
3037 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3038 /* Not single shifting at first. */
3039 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3040 /* Beginning of buffer should also be regarded as bol. */
3041 CODING_SPEC_ISO_BOL (coding) = 1;
3042
3043 for (charset = 0; charset <= MAX_CHARSET; charset++)
3044 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3045 val = Vcharset_revision_alist;
3046 while (CONSP (val))
3047 {
3048 charset = get_charset_id (Fcar_safe (XCAR (val)));
3049 if (charset >= 0
3050 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3051 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3052 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3053 val = XCDR (val);
3054 }
3055
3056 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3057 FLAGS[REG] can be one of below:
3058 integer CHARSET: CHARSET occupies register I,
3059 t: designate nothing to REG initially, but can be used
3060 by any charsets,
3061 list of integer, nil, or t: designate the first
3062 element (if integer) to REG initially, the remaining
3063 elements (if integer) is designated to REG on request,
3064 if an element is t, REG can be used by any charsets,
3065 nil: REG is never used. */
3066 for (charset = 0; charset <= MAX_CHARSET; charset++)
3067 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3068 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3069 for (i = 0; i < 4; i++)
3070 {
3071 if (INTEGERP (flags[i])
3072 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3073 || (charset = get_charset_id (flags[i])) >= 0)
3074 {
3075 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3076 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3077 }
3078 else if (EQ (flags[i], Qt))
3079 {
3080 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3081 reg_bits |= 1 << i;
3082 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3083 }
3084 else if (CONSP (flags[i]))
3085 {
3086 Lisp_Object tail;
3087 tail = flags[i];
3088
3089 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3090 if (INTEGERP (XCAR (tail))
3091 && (charset = XINT (XCAR (tail)),
3092 CHARSET_VALID_P (charset))
3093 || (charset = get_charset_id (XCAR (tail))) >= 0)
3094 {
3095 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3096 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3097 }
3098 else
3099 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3100 tail = XCDR (tail);
3101 while (CONSP (tail))
3102 {
3103 if (INTEGERP (XCAR (tail))
3104 && (charset = XINT (XCAR (tail)),
3105 CHARSET_VALID_P (charset))
3106 || (charset = get_charset_id (XCAR (tail))) >= 0)
3107 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3108 = i;
3109 else if (EQ (XCAR (tail), Qt))
3110 reg_bits |= 1 << i;
3111 tail = XCDR (tail);
3112 }
3113 }
3114 else
3115 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3116
3117 CODING_SPEC_ISO_DESIGNATION (coding, i)
3118 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3119 }
3120
3121 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3122 {
3123 /* REG 1 can be used only by locking shift in 7-bit env. */
3124 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3125 reg_bits &= ~2;
3126 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3127 /* Without any shifting, only REG 0 and 1 can be used. */
3128 reg_bits &= 3;
3129 }
3130
3131 if (reg_bits)
3132 for (charset = 0; charset <= MAX_CHARSET; charset++)
3133 {
3134 if (CHARSET_VALID_P (charset))
3135 {
3136 /* There exist some default graphic registers to be
3137 used CHARSET. */
3138
3139 /* We had better avoid designating a charset of
3140 CHARS96 to REG 0 as far as possible. */
3141 if (CHARSET_CHARS (charset) == 96)
3142 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3143 = (reg_bits & 2
3144 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3145 else
3146 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3147 = (reg_bits & 1
3148 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3149 }
3150 }
3151 }
3152 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3153 coding->spec.iso2022.last_invalid_designation_register = -1;
3154 break;
3155
3156 case 3:
3157 coding->type = coding_type_big5;
3158 coding->common_flags
3159 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3160 coding->flags
3161 = (NILP (XVECTOR (coding_spec)->contents[4])
3162 ? CODING_FLAG_BIG5_HKU
3163 : CODING_FLAG_BIG5_ETEN);
3164 break;
3165
3166 case 4:
3167 coding->type = coding_type_ccl;
3168 coding->common_flags
3169 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3170 {
3171 val = XVECTOR (coding_spec)->contents[4];
3172 if (! CONSP (val)
3173 || setup_ccl_program (&(coding->spec.ccl.decoder),
3174 XCAR (val)) < 0
3175 || setup_ccl_program (&(coding->spec.ccl.encoder),
3176 XCDR (val)) < 0)
3177 goto label_invalid_coding_system;
3178
3179 bzero (coding->spec.ccl.valid_codes, 256);
3180 val = Fplist_get (plist, Qvalid_codes);
3181 if (CONSP (val))
3182 {
3183 Lisp_Object this;
3184
3185 for (; CONSP (val); val = XCDR (val))
3186 {
3187 this = XCAR (val);
3188 if (INTEGERP (this)
3189 && XINT (this) >= 0 && XINT (this) < 256)
3190 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3191 else if (CONSP (this)
3192 && INTEGERP (XCAR (this))
3193 && INTEGERP (XCDR (this)))
3194 {
3195 int start = XINT (XCAR (this));
3196 int end = XINT (XCDR (this));
3197
3198 if (start >= 0 && start <= end && end < 256)
3199 while (start <= end)
3200 coding->spec.ccl.valid_codes[start++] = 1;
3201 }
3202 }
3203 }
3204 }
3205 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3206 break;
3207
3208 case 5:
3209 coding->type = coding_type_raw_text;
3210 break;
3211
3212 default:
3213 goto label_invalid_coding_system;
3214 }
3215 return 0;
3216
3217 label_invalid_coding_system:
3218 coding->type = coding_type_no_conversion;
3219 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3220 coding->common_flags = 0;
3221 coding->eol_type = CODING_EOL_LF;
3222 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3223 return -1;
3224 }
3225
3226 /* Free memory blocks allocated for storing composition information. */
3227
3228 void
3229 coding_free_composition_data (coding)
3230 struct coding_system *coding;
3231 {
3232 struct composition_data *cmp_data = coding->cmp_data, *next;
3233
3234 if (!cmp_data)
3235 return;
3236 /* Memory blocks are chained. At first, rewind to the first, then,
3237 free blocks one by one. */
3238 while (cmp_data->prev)
3239 cmp_data = cmp_data->prev;
3240 while (cmp_data)
3241 {
3242 next = cmp_data->next;
3243 xfree (cmp_data);
3244 cmp_data = next;
3245 }
3246 coding->cmp_data = NULL;
3247 }
3248
3249 /* Set `char_offset' member of all memory blocks pointed by
3250 coding->cmp_data to POS. */
3251
3252 void
3253 coding_adjust_composition_offset (coding, pos)
3254 struct coding_system *coding;
3255 int pos;
3256 {
3257 struct composition_data *cmp_data;
3258
3259 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3260 cmp_data->char_offset = pos;
3261 }
3262
3263 /* Setup raw-text or one of its subsidiaries in the structure
3264 coding_system CODING according to the already setup value eol_type
3265 in CODING. CODING should be setup for some coding system in
3266 advance. */
3267
3268 void
3269 setup_raw_text_coding_system (coding)
3270 struct coding_system *coding;
3271 {
3272 if (coding->type != coding_type_raw_text)
3273 {
3274 coding->symbol = Qraw_text;
3275 coding->type = coding_type_raw_text;
3276 if (coding->eol_type != CODING_EOL_UNDECIDED)
3277 {
3278 Lisp_Object subsidiaries;
3279 subsidiaries = Fget (Qraw_text, Qeol_type);
3280
3281 if (VECTORP (subsidiaries)
3282 && XVECTOR (subsidiaries)->size == 3)
3283 coding->symbol
3284 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3285 }
3286 setup_coding_system (coding->symbol, coding);
3287 }
3288 return;
3289 }
3290
3291 /* Emacs has a mechanism to automatically detect a coding system if it
3292 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3293 it's impossible to distinguish some coding systems accurately
3294 because they use the same range of codes. So, at first, coding
3295 systems are grouped into the following categories:
3296
3297 o coding-category-emacs-mule
3298
3299 The category for a coding system which has the same code range
3300 as Emacs' internal format. Assigned the coding-system (Lisp
3301 symbol) `emacs-mule' by default.
3302
3303 o coding-category-sjis
3304
3305 The category for a coding system which has the same code range
3306 as SJIS. Assigned the coding-system (Lisp
3307 symbol) `japanese-shift-jis' by default.
3308
3309 o coding-category-iso-7
3310
3311 The category for a coding system which has the same code range
3312 as ISO2022 of 7-bit environment. This doesn't use any locking
3313 shift and single shift functions. This can encode/decode all
3314 charsets. Assigned the coding-system (Lisp symbol)
3315 `iso-2022-7bit' by default.
3316
3317 o coding-category-iso-7-tight
3318
3319 Same as coding-category-iso-7 except that this can
3320 encode/decode only the specified charsets.
3321
3322 o coding-category-iso-8-1
3323
3324 The category for a coding system which has the same code range
3325 as ISO2022 of 8-bit environment and graphic plane 1 used only
3326 for DIMENSION1 charset. This doesn't use any locking shift
3327 and single shift functions. Assigned the coding-system (Lisp
3328 symbol) `iso-latin-1' by default.
3329
3330 o coding-category-iso-8-2
3331
3332 The category for a coding system which has the same code range
3333 as ISO2022 of 8-bit environment and graphic plane 1 used only
3334 for DIMENSION2 charset. This doesn't use any locking shift
3335 and single shift functions. Assigned the coding-system (Lisp
3336 symbol) `japanese-iso-8bit' by default.
3337
3338 o coding-category-iso-7-else
3339
3340 The category for a coding system which has the same code range
3341 as ISO2022 of 7-bit environment but uses locking shift or
3342 single shift functions. Assigned the coding-system (Lisp
3343 symbol) `iso-2022-7bit-lock' by default.
3344
3345 o coding-category-iso-8-else
3346
3347 The category for a coding system which has the same code range
3348 as ISO2022 of 8-bit environment but uses locking shift or
3349 single shift functions. Assigned the coding-system (Lisp
3350 symbol) `iso-2022-8bit-ss2' by default.
3351
3352 o coding-category-big5
3353
3354 The category for a coding system which has the same code range
3355 as BIG5. Assigned the coding-system (Lisp symbol)
3356 `cn-big5' by default.
3357
3358 o coding-category-utf-8
3359
3360 The category for a coding system which has the same code range
3361 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3362 symbol) `utf-8' by default.
3363
3364 o coding-category-utf-16-be
3365
3366 The category for a coding system in which a text has an
3367 Unicode signature (cf. Unicode Standard) in the order of BIG
3368 endian at the head. Assigned the coding-system (Lisp symbol)
3369 `utf-16-be' by default.
3370
3371 o coding-category-utf-16-le
3372
3373 The category for a coding system in which a text has an
3374 Unicode signature (cf. Unicode Standard) in the order of
3375 LITTLE endian at the head. Assigned the coding-system (Lisp
3376 symbol) `utf-16-le' by default.
3377
3378 o coding-category-ccl
3379
3380 The category for a coding system of which encoder/decoder is
3381 written in CCL programs. The default value is nil, i.e., no
3382 coding system is assigned.
3383
3384 o coding-category-binary
3385
3386 The category for a coding system not categorized in any of the
3387 above. Assigned the coding-system (Lisp symbol)
3388 `no-conversion' by default.
3389
3390 Each of them is a Lisp symbol whose value is an actual
3391 `coding-system' (also a Lisp symbol) assigned by a user.
3392 What Emacs does actually is to detect a category of coding system.
3393 Then, it uses a `coding-system' assigned to it. If Emacs can't
3394 decide only one possible category, it selects a category of the
3395 highest priority. Priorities of categories are also specified by a
3396 user in a Lisp variable `coding-category-list'.
3397
3398 */
3399
3400 static
3401 int ascii_skip_code[256];
3402
3403 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3404 If it detects possible coding systems, return an integer in which
3405 appropriate flag bits are set. Flag bits are defined by macros
3406 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3407 it should point the table `coding_priorities'. In that case, only
3408 the flag bit for a coding system of the highest priority is set in
3409 the returned value.
3410
3411 How many ASCII characters are at the head is returned as *SKIP. */
3412
3413 static int
3414 detect_coding_mask (source, src_bytes, priorities, skip)
3415 unsigned char *source;
3416 int src_bytes, *priorities, *skip;
3417 {
3418 register unsigned char c;
3419 unsigned char *src = source, *src_end = source + src_bytes;
3420 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3421 int i, idx;
3422
3423 /* At first, skip all ASCII characters and control characters except
3424 for three ISO2022 specific control characters. */
3425 ascii_skip_code[ISO_CODE_SO] = 0;
3426 ascii_skip_code[ISO_CODE_SI] = 0;
3427 ascii_skip_code[ISO_CODE_ESC] = 0;
3428
3429 label_loop_detect_coding:
3430 while (src < src_end && ascii_skip_code[*src]) src++;
3431 *skip = src - source;
3432
3433 if (src >= src_end)
3434 /* We found nothing other than ASCII. There's nothing to do. */
3435 return 0;
3436
3437 c = *src;
3438 /* The text seems to be encoded in some multilingual coding system.
3439 Now, try to find in which coding system the text is encoded. */
3440 if (c < 0x80)
3441 {
3442 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3443 /* C is an ISO2022 specific control code of C0. */
3444 mask = detect_coding_iso2022 (src, src_end);
3445 if (mask == 0)
3446 {
3447 /* No valid ISO2022 code follows C. Try again. */
3448 src++;
3449 if (c == ISO_CODE_ESC)
3450 ascii_skip_code[ISO_CODE_ESC] = 1;
3451 else
3452 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3453 goto label_loop_detect_coding;
3454 }
3455 if (priorities)
3456 {
3457 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3458 {
3459 if (mask & priorities[i])
3460 return priorities[i];
3461 }
3462 return CODING_CATEGORY_MASK_RAW_TEXT;
3463 }
3464 }
3465 else
3466 {
3467 int try;
3468
3469 if (c < 0xA0)
3470 {
3471 /* C is the first byte of SJIS character code,
3472 or a leading-code of Emacs' internal format (emacs-mule),
3473 or the first byte of UTF-16. */
3474 try = (CODING_CATEGORY_MASK_SJIS
3475 | CODING_CATEGORY_MASK_EMACS_MULE
3476 | CODING_CATEGORY_MASK_UTF_16_BE
3477 | CODING_CATEGORY_MASK_UTF_16_LE);
3478
3479 /* Or, if C is a special latin extra code,
3480 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3481 or is an ISO2022 control-sequence-introducer (CSI),
3482 we should also consider the possibility of ISO2022 codings. */
3483 if ((VECTORP (Vlatin_extra_code_table)
3484 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3485 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3486 || (c == ISO_CODE_CSI
3487 && (src < src_end
3488 && (*src == ']'
3489 || ((*src == '0' || *src == '1' || *src == '2')
3490 && src + 1 < src_end
3491 && src[1] == ']')))))
3492 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3493 | CODING_CATEGORY_MASK_ISO_8BIT);
3494 }
3495 else
3496 /* C is a character of ISO2022 in graphic plane right,
3497 or a SJIS's 1-byte character code (i.e. JISX0201),
3498 or the first byte of BIG5's 2-byte code,
3499 or the first byte of UTF-8/16. */
3500 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3501 | CODING_CATEGORY_MASK_ISO_8BIT
3502 | CODING_CATEGORY_MASK_SJIS
3503 | CODING_CATEGORY_MASK_BIG5
3504 | CODING_CATEGORY_MASK_UTF_8
3505 | CODING_CATEGORY_MASK_UTF_16_BE
3506 | CODING_CATEGORY_MASK_UTF_16_LE);
3507
3508 /* Or, we may have to consider the possibility of CCL. */
3509 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3510 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511 ->spec.ccl.valid_codes)[c])
3512 try |= CODING_CATEGORY_MASK_CCL;
3513
3514 mask = 0;
3515 utf16_examined_p = iso2022_examined_p = 0;
3516 if (priorities)
3517 {
3518 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3519 {
3520 if (!iso2022_examined_p
3521 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3522 {
3523 mask |= detect_coding_iso2022 (src, src_end);
3524 iso2022_examined_p = 1;
3525 }
3526 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3527 mask |= detect_coding_sjis (src, src_end);
3528 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3529 mask |= detect_coding_utf_8 (src, src_end);
3530 else if (!utf16_examined_p
3531 && (priorities[i] & try &
3532 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3533 {
3534 mask |= detect_coding_utf_16 (src, src_end);
3535 utf16_examined_p = 1;
3536 }
3537 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3538 mask |= detect_coding_big5 (src, src_end);
3539 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3540 mask |= detect_coding_emacs_mule (src, src_end);
3541 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3542 mask |= detect_coding_ccl (src, src_end);
3543 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3544 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3545 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3546 mask |= CODING_CATEGORY_MASK_BINARY;
3547 if (mask & priorities[i])
3548 return priorities[i];
3549 }
3550 return CODING_CATEGORY_MASK_RAW_TEXT;
3551 }
3552 if (try & CODING_CATEGORY_MASK_ISO)
3553 mask |= detect_coding_iso2022 (src, src_end);
3554 if (try & CODING_CATEGORY_MASK_SJIS)
3555 mask |= detect_coding_sjis (src, src_end);
3556 if (try & CODING_CATEGORY_MASK_BIG5)
3557 mask |= detect_coding_big5 (src, src_end);
3558 if (try & CODING_CATEGORY_MASK_UTF_8)
3559 mask |= detect_coding_utf_8 (src, src_end);
3560 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3561 mask |= detect_coding_utf_16 (src, src_end);
3562 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3563 mask |= detect_coding_emacs_mule (src, src_end);
3564 if (try & CODING_CATEGORY_MASK_CCL)
3565 mask |= detect_coding_ccl (src, src_end);
3566 }
3567 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3568 }
3569
3570 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3571 The information of the detected coding system is set in CODING. */
3572
3573 void
3574 detect_coding (coding, src, src_bytes)
3575 struct coding_system *coding;
3576 unsigned char *src;
3577 int src_bytes;
3578 {
3579 unsigned int idx;
3580 int skip, mask, i;
3581 Lisp_Object val;
3582
3583 val = Vcoding_category_list;
3584 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3585 coding->heading_ascii = skip;
3586
3587 if (!mask) return;
3588
3589 /* We found a single coding system of the highest priority in MASK. */
3590 idx = 0;
3591 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3592 if (! mask)
3593 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3594
3595 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3596
3597 if (coding->eol_type != CODING_EOL_UNDECIDED)
3598 {
3599 Lisp_Object tmp;
3600
3601 tmp = Fget (val, Qeol_type);
3602 if (VECTORP (tmp))
3603 val = XVECTOR (tmp)->contents[coding->eol_type];
3604 }
3605
3606 /* Setup this new coding system while preserving some slots. */
3607 {
3608 int src_multibyte = coding->src_multibyte;
3609 int dst_multibyte = coding->dst_multibyte;
3610
3611 setup_coding_system (val, coding);
3612 coding->src_multibyte = src_multibyte;
3613 coding->dst_multibyte = dst_multibyte;
3614 coding->heading_ascii = skip;
3615 }
3616 }
3617
3618 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3619 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3620 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3621
3622 How many non-eol characters are at the head is returned as *SKIP. */
3623
3624 #define MAX_EOL_CHECK_COUNT 3
3625
3626 static int
3627 detect_eol_type (source, src_bytes, skip)
3628 unsigned char *source;
3629 int src_bytes, *skip;
3630 {
3631 unsigned char *src = source, *src_end = src + src_bytes;
3632 unsigned char c;
3633 int total = 0; /* How many end-of-lines are found so far. */
3634 int eol_type = CODING_EOL_UNDECIDED;
3635 int this_eol_type;
3636
3637 *skip = 0;
3638
3639 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3640 {
3641 c = *src++;
3642 if (c == '\n' || c == '\r')
3643 {
3644 if (*skip == 0)
3645 *skip = src - 1 - source;
3646 total++;
3647 if (c == '\n')
3648 this_eol_type = CODING_EOL_LF;
3649 else if (src >= src_end || *src != '\n')
3650 this_eol_type = CODING_EOL_CR;
3651 else
3652 this_eol_type = CODING_EOL_CRLF, src++;
3653
3654 if (eol_type == CODING_EOL_UNDECIDED)
3655 /* This is the first end-of-line. */
3656 eol_type = this_eol_type;
3657 else if (eol_type != this_eol_type)
3658 {
3659 /* The found type is different from what found before. */
3660 eol_type = CODING_EOL_INCONSISTENT;
3661 break;
3662 }
3663 }
3664 }
3665
3666 if (*skip == 0)
3667 *skip = src_end - source;
3668 return eol_type;
3669 }
3670
3671 /* Like detect_eol_type, but detect EOL type in 2-octet
3672 big-endian/little-endian format for coding systems utf-16-be and
3673 utf-16-le. */
3674
3675 static int
3676 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3677 unsigned char *source;
3678 int src_bytes, *skip;
3679 {
3680 unsigned char *src = source, *src_end = src + src_bytes;
3681 unsigned int c1, c2;
3682 int total = 0; /* How many end-of-lines are found so far. */
3683 int eol_type = CODING_EOL_UNDECIDED;
3684 int this_eol_type;
3685 int msb, lsb;
3686
3687 if (big_endian_p)
3688 msb = 0, lsb = 1;
3689 else
3690 msb = 1, lsb = 0;
3691
3692 *skip = 0;
3693
3694 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3695 {
3696 c1 = (src[msb] << 8) | (src[lsb]);
3697 src += 2;
3698
3699 if (c1 == '\n' || c1 == '\r')
3700 {
3701 if (*skip == 0)
3702 *skip = src - 2 - source;
3703 total++;
3704 if (c1 == '\n')
3705 {
3706 this_eol_type = CODING_EOL_LF;
3707 }
3708 else
3709 {
3710 if ((src + 1) >= src_end)
3711 {
3712 this_eol_type = CODING_EOL_CR;
3713 }
3714 else
3715 {
3716 c2 = (src[msb] << 8) | (src[lsb]);
3717 if (c2 == '\n')
3718 this_eol_type = CODING_EOL_CRLF, src += 2;
3719 else
3720 this_eol_type = CODING_EOL_CR;
3721 }
3722 }
3723
3724 if (eol_type == CODING_EOL_UNDECIDED)
3725 /* This is the first end-of-line. */
3726 eol_type = this_eol_type;
3727 else if (eol_type != this_eol_type)
3728 {
3729 /* The found type is different from what found before. */
3730 eol_type = CODING_EOL_INCONSISTENT;
3731 break;
3732 }
3733 }
3734 }
3735
3736 if (*skip == 0)
3737 *skip = src_end - source;
3738 return eol_type;
3739 }
3740
3741 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3742 is encoded. If it detects an appropriate format of end-of-line, it
3743 sets the information in *CODING. */
3744
3745 void
3746 detect_eol (coding, src, src_bytes)
3747 struct coding_system *coding;
3748 unsigned char *src;
3749 int src_bytes;
3750 {
3751 Lisp_Object val;
3752 int skip;
3753 int eol_type;
3754
3755 switch (coding->category_idx)
3756 {
3757 case CODING_CATEGORY_IDX_UTF_16_BE:
3758 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3759 break;
3760 case CODING_CATEGORY_IDX_UTF_16_LE:
3761 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3762 break;
3763 default:
3764 eol_type = detect_eol_type (src, src_bytes, &skip);
3765 break;
3766 }
3767
3768 if (coding->heading_ascii > skip)
3769 coding->heading_ascii = skip;
3770 else
3771 skip = coding->heading_ascii;
3772
3773 if (eol_type == CODING_EOL_UNDECIDED)
3774 return;
3775 if (eol_type == CODING_EOL_INCONSISTENT)
3776 {
3777 #if 0
3778 /* This code is suppressed until we find a better way to
3779 distinguish raw text file and binary file. */
3780
3781 /* If we have already detected that the coding is raw-text, the
3782 coding should actually be no-conversion. */
3783 if (coding->type == coding_type_raw_text)
3784 {
3785 setup_coding_system (Qno_conversion, coding);
3786 return;
3787 }
3788 /* Else, let's decode only text code anyway. */
3789 #endif /* 0 */
3790 eol_type = CODING_EOL_LF;
3791 }
3792
3793 val = Fget (coding->symbol, Qeol_type);
3794 if (VECTORP (val) && XVECTOR (val)->size == 3)
3795 {
3796 int src_multibyte = coding->src_multibyte;
3797 int dst_multibyte = coding->dst_multibyte;
3798
3799 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3800 coding->src_multibyte = src_multibyte;
3801 coding->dst_multibyte = dst_multibyte;
3802 coding->heading_ascii = skip;
3803 }
3804 }
3805
3806 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3807
3808 #define DECODING_BUFFER_MAG(coding) \
3809 (coding->type == coding_type_iso2022 \
3810 ? 3 \
3811 : (coding->type == coding_type_ccl \
3812 ? coding->spec.ccl.decoder.buf_magnification \
3813 : 2))
3814
3815 /* Return maximum size (bytes) of a buffer enough for decoding
3816 SRC_BYTES of text encoded in CODING. */
3817
3818 int
3819 decoding_buffer_size (coding, src_bytes)
3820 struct coding_system *coding;
3821 int src_bytes;
3822 {
3823 return (src_bytes * DECODING_BUFFER_MAG (coding)
3824 + CONVERSION_BUFFER_EXTRA_ROOM);
3825 }
3826
3827 /* Return maximum size (bytes) of a buffer enough for encoding
3828 SRC_BYTES of text to CODING. */
3829
3830 int
3831 encoding_buffer_size (coding, src_bytes)
3832 struct coding_system *coding;
3833 int src_bytes;
3834 {
3835 int magnification;
3836
3837 if (coding->type == coding_type_ccl)
3838 magnification = coding->spec.ccl.encoder.buf_magnification;
3839 else if (CODING_REQUIRE_ENCODING (coding))
3840 magnification = 3;
3841 else
3842 magnification = 1;
3843
3844 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3845 }
3846
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The same buffer is reused across calls; when
   it is too small, a new one is allocated by doubling the current
   size until it is large enough, and the old one is freed WITHOUT
   copying its contents.  So, a pointer returned by an earlier call
   must not be used after a later call with a larger SIZE.

   The result of xmalloc is used unchecked; NOTE(review): xmalloc
   presumably does not return when memory is exhausted, so this
   function does not return NULL by itself -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Start doubling from the current size.  If no buffer has been
         set up yet (size 0), start from the minimum size instead;
         doubling 0 would never reach SIZE and loop forever.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3875
3876 int
3877 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3878 struct coding_system *coding;
3879 unsigned char *source, *destination;
3880 int src_bytes, dst_bytes, encodep;
3881 {
3882 struct ccl_program *ccl
3883 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3884 int result;
3885
3886 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3887
3888 coding->produced = ccl_driver (ccl, source, destination,
3889 src_bytes, dst_bytes, &(coding->consumed));
3890 if (encodep)
3891 coding->produced_char = coding->produced;
3892 else
3893 {
3894 int bytes
3895 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3896 coding->produced = str_as_multibyte (destination, bytes,
3897 coding->produced,
3898 &(coding->produced_char));
3899 }
3900
3901 switch (ccl->status)
3902 {
3903 case CCL_STAT_SUSPEND_BY_SRC:
3904 result = CODING_FINISH_INSUFFICIENT_SRC;
3905 break;
3906 case CCL_STAT_SUSPEND_BY_DST:
3907 result = CODING_FINISH_INSUFFICIENT_DST;
3908 break;
3909 case CCL_STAT_QUIT:
3910 case CCL_STAT_INVALID_CMD:
3911 result = CODING_FINISH_INTERRUPT;
3912 break;
3913 default:
3914 result = CODING_FINISH_NORMAL;
3915 break;
3916 }
3917 return result;
3918 }
3919
3920 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3921 decoding, it may detect coding system and format of end-of-line if
3922 those are not yet decided. The source should be unibyte, the
3923 result is multibyte if CODING->dst_multibyte is nonzero, else
3924 unibyte. */
3925
3926 int
3927 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3928 struct coding_system *coding;
3929 unsigned char *source, *destination;
3930 int src_bytes, dst_bytes;
3931 {
3932 if (coding->type == coding_type_undecided)
3933 detect_coding (coding, source, src_bytes);
3934
3935 if (coding->eol_type == CODING_EOL_UNDECIDED)
3936 detect_eol (coding, source, src_bytes);
3937
3938 coding->produced = coding->produced_char = 0;
3939 coding->consumed = coding->consumed_char = 0;
3940 coding->errors = 0;
3941 coding->result = CODING_FINISH_NORMAL;
3942
3943 switch (coding->type)
3944 {
3945 case coding_type_sjis:
3946 decode_coding_sjis_big5 (coding, source, destination,
3947 src_bytes, dst_bytes, 1);
3948 break;
3949
3950 case coding_type_iso2022:
3951 decode_coding_iso2022 (coding, source, destination,
3952 src_bytes, dst_bytes);
3953 break;
3954
3955 case coding_type_big5:
3956 decode_coding_sjis_big5 (coding, source, destination,
3957 src_bytes, dst_bytes, 0);
3958 break;
3959
3960 case coding_type_emacs_mule:
3961 decode_coding_emacs_mule (coding, source, destination,
3962 src_bytes, dst_bytes);
3963 break;
3964
3965 case coding_type_ccl:
3966 ccl_coding_driver (coding, source, destination,
3967 src_bytes, dst_bytes, 0);
3968 break;
3969
3970 default:
3971 decode_eol (coding, source, destination, src_bytes, dst_bytes);
3972 }
3973
3974 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
3975 && coding->consumed == src_bytes)
3976 coding->result = CODING_FINISH_NORMAL;
3977
3978 if (coding->mode & CODING_MODE_LAST_BLOCK
3979 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
3980 {
3981 unsigned char *src = source + coding->consumed;
3982 unsigned char *dst = destination + coding->produced;
3983
3984 src_bytes -= coding->consumed;
3985 coding->errors++;
3986 if (COMPOSING_P (coding))
3987 DECODE_COMPOSITION_END ('1');
3988 while (src_bytes--)
3989 {
3990 int c = *src++;
3991 dst += CHAR_STRING (c, dst);
3992 coding->produced_char++;
3993 }
3994 coding->consumed = coding->consumed_char = src - source;
3995 coding->produced = dst - destination;
3996 }
3997
3998 if (!coding->dst_multibyte)
3999 {
4000 coding->produced = str_as_unibyte (destination, coding->produced);
4001 coding->produced_char = coding->produced;
4002 }
4003
4004 return coding->result;
4005 }
4006
4007 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4008 multibyteness of the source is CODING->src_multibyte, the
4009 multibyteness of the result is always unibyte. */
4010
4011 int
4012 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4013 struct coding_system *coding;
4014 unsigned char *source, *destination;
4015 int src_bytes, dst_bytes;
4016 {
4017 coding->produced = coding->produced_char = 0;
4018 coding->consumed = coding->consumed_char = 0;
4019 coding->errors = 0;
4020 coding->result = CODING_FINISH_NORMAL;
4021
4022 switch (coding->type)
4023 {
4024 case coding_type_sjis:
4025 encode_coding_sjis_big5 (coding, source, destination,
4026 src_bytes, dst_bytes, 1);
4027 break;
4028
4029 case coding_type_iso2022:
4030 encode_coding_iso2022 (coding, source, destination,
4031 src_bytes, dst_bytes);
4032 break;
4033
4034 case coding_type_big5:
4035 encode_coding_sjis_big5 (coding, source, destination,
4036 src_bytes, dst_bytes, 0);
4037 break;
4038
4039 case coding_type_emacs_mule:
4040 encode_coding_emacs_mule (coding, source, destination,
4041 src_bytes, dst_bytes);
4042 break;
4043
4044 case coding_type_ccl:
4045 ccl_coding_driver (coding, source, destination,
4046 src_bytes, dst_bytes, 1);
4047 break;
4048
4049 default:
4050 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4051 }
4052
4053 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4054 && coding->consumed == src_bytes)
4055 coding->result = CODING_FINISH_NORMAL;
4056
4057 if (coding->mode & CODING_MODE_LAST_BLOCK)
4058 {
4059 unsigned char *src = source + coding->consumed;
4060 unsigned char *src_end = src + src_bytes;
4061 unsigned char *dst = destination + coding->produced;
4062
4063 if (coding->type == coding_type_iso2022)
4064 ENCODE_RESET_PLANE_AND_REGISTER;
4065 if (COMPOSING_P (coding))
4066 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4067 if (coding->consumed < src_bytes)
4068 {
4069 int len = src_bytes - coding->consumed;
4070
4071 BCOPY_SHORT (source + coding->consumed, dst, len);
4072 if (coding->src_multibyte)
4073 len = str_as_unibyte (dst, len);
4074 dst += len;
4075 coding->consumed = src_bytes;
4076 }
4077 coding->produced = coding->produced_char = dst - destination;
4078 }
4079
4080 return coding->result;
4081 }
4082
4083 /* Scan text in the region between *BEG and *END (byte positions),
4084 skip characters which we don't have to decode by coding system
4085 CODING at the head and tail, then set *BEG and *END to the region
4086 of the text we actually have to convert. The caller should move
4087 the gap out of the region in advance if the region is from a
4088 buffer.
4089
4090 If STR is not NULL, *BEG and *END are indices into STR. */
4091
4092 static void
4093 shrink_decoding_region (beg, end, coding, str)
4094 int *beg, *end;
4095 struct coding_system *coding;
4096 unsigned char *str;
4097 {
4098 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4099 int eol_conversion;
4100 Lisp_Object translation_table;
4101
4102 if (coding->type == coding_type_ccl
4103 || coding->type == coding_type_undecided
4104 || coding->eol_type != CODING_EOL_LF
4105 || !NILP (coding->post_read_conversion)
4106 || coding->composing != COMPOSITION_DISABLED)
4107 {
4108 /* We can't skip any data. */
4109 return;
4110 }
4111 if (coding->type == coding_type_no_conversion
4112 || coding->type == coding_type_raw_text
4113 || coding->type == coding_type_emacs_mule)
4114 {
4115 /* We need no conversion, but don't have to skip any data here.
4116 Decoding routine handles them effectively anyway. */
4117 return;
4118 }
4119
4120 translation_table = coding->translation_table_for_decode;
4121 if (NILP (translation_table) && !NILP (Venable_character_translation))
4122 translation_table = Vstandard_translation_table_for_decode;
4123 if (CHAR_TABLE_P (translation_table))
4124 {
4125 int i;
4126 for (i = 0; i < 128; i++)
4127 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4128 break;
4129 if (i < 128)
4130 /* Some ASCII character should be translated. We give up
4131 shrinking. */
4132 return;
4133 }
4134
4135 if (coding->heading_ascii >= 0)
4136 /* Detection routine has already found how much we can skip at the
4137 head. */
4138 *beg += coding->heading_ascii;
4139
4140 if (str)
4141 {
4142 begp_orig = begp = str + *beg;
4143 endp_orig = endp = str + *end;
4144 }
4145 else
4146 {
4147 begp_orig = begp = BYTE_POS_ADDR (*beg);
4148 endp_orig = endp = begp + *end - *beg;
4149 }
4150
4151 eol_conversion = (coding->eol_type == CODING_EOL_CR
4152 || coding->eol_type == CODING_EOL_CRLF);
4153
4154 switch (coding->type)
4155 {
4156 case coding_type_sjis:
4157 case coding_type_big5:
4158 /* We can skip all ASCII characters at the head. */
4159 if (coding->heading_ascii < 0)
4160 {
4161 if (eol_conversion)
4162 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4163 else
4164 while (begp < endp && *begp < 0x80) begp++;
4165 }
4166 /* We can skip all ASCII characters at the tail except for the
4167 second byte of SJIS or BIG5 code. */
4168 if (eol_conversion)
4169 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4170 else
4171 while (begp < endp && endp[-1] < 0x80) endp--;
4172 /* Do not consider LF as ascii if preceded by CR, since that
4173 confuses eol decoding. */
4174 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4175 endp++;
4176 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4177 endp++;
4178 break;
4179
4180 case coding_type_iso2022:
4181 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4182 /* We can't skip any data. */
4183 break;
4184 if (coding->heading_ascii < 0)
4185 {
4186 /* We can skip all ASCII characters at the head except for a
4187 few control codes. */
4188 while (begp < endp && (c = *begp) < 0x80
4189 && c != ISO_CODE_CR && c != ISO_CODE_SO
4190 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4191 && (!eol_conversion || c != ISO_CODE_LF))
4192 begp++;
4193 }
4194 switch (coding->category_idx)
4195 {
4196 case CODING_CATEGORY_IDX_ISO_8_1:
4197 case CODING_CATEGORY_IDX_ISO_8_2:
4198 /* We can skip all ASCII characters at the tail. */
4199 if (eol_conversion)
4200 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4201 else
4202 while (begp < endp && endp[-1] < 0x80) endp--;
4203 /* Do not consider LF as ascii if preceded by CR, since that
4204 confuses eol decoding. */
4205 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4206 endp++;
4207 break;
4208
4209 case CODING_CATEGORY_IDX_ISO_7:
4210 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4211 {
4212 /* We can skip all charactes at the tail except for 8-bit
4213 codes and ESC and the following 2-byte at the tail. */
4214 unsigned char *eight_bit = NULL;
4215
4216 if (eol_conversion)
4217 while (begp < endp
4218 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4219 {
4220 if (!eight_bit && c & 0x80) eight_bit = endp;
4221 endp--;
4222 }
4223 else
4224 while (begp < endp
4225 && (c = endp[-1]) != ISO_CODE_ESC)
4226 {
4227 if (!eight_bit && c & 0x80) eight_bit = endp;
4228 endp--;
4229 }
4230 /* Do not consider LF as ascii if preceded by CR, since that
4231 confuses eol decoding. */
4232 if (begp < endp && endp < endp_orig
4233 && endp[-1] == '\r' && endp[0] == '\n')
4234 endp++;
4235 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4236 {
4237 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4238 /* This is an ASCII designation sequence. We can
4239 surely skip the tail. But, if we have
4240 encountered an 8-bit code, skip only the codes
4241 after that. */
4242 endp = eight_bit ? eight_bit : endp + 2;
4243 else
4244 /* Hmmm, we can't skip the tail. */
4245 endp = endp_orig;
4246 }
4247 else if (eight_bit)
4248 endp = eight_bit;
4249 }
4250 }
4251 break;
4252
4253 default:
4254 abort ();
4255 }
4256 *beg += begp - begp_orig;
4257 *end += endp - endp_orig;
4258 return;
4259 }
4260
4261 /* Like shrink_decoding_region but for encoding. */
4262
4263 static void
4264 shrink_encoding_region (beg, end, coding, str)
4265 int *beg, *end;
4266 struct coding_system *coding;
4267 unsigned char *str;
4268 {
4269 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4270 int eol_conversion;
4271 Lisp_Object translation_table;
4272
4273 if (coding->type == coding_type_ccl
4274 || coding->eol_type == CODING_EOL_CRLF
4275 || coding->eol_type == CODING_EOL_CR
4276 || coding->cmp_data && coding->cmp_data->used > 0)
4277 {
4278 /* We can't skip any data. */
4279 return;
4280 }
4281 if (coding->type == coding_type_no_conversion
4282 || coding->type == coding_type_raw_text
4283 || coding->type == coding_type_emacs_mule
4284 || coding->type == coding_type_undecided)
4285 {
4286 /* We need no conversion, but don't have to skip any data here.
4287 Encoding routine handles them effectively anyway. */
4288 return;
4289 }
4290
4291 translation_table = coding->translation_table_for_encode;
4292 if (NILP (translation_table) && !NILP (Venable_character_translation))
4293 translation_table = Vstandard_translation_table_for_encode;
4294 if (CHAR_TABLE_P (translation_table))
4295 {
4296 int i;
4297 for (i = 0; i < 128; i++)
4298 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4299 break;
4300 if (i < 128)
4301 /* Some ASCII character should be tranlsated. We give up
4302 shrinking. */
4303 return;
4304 }
4305
4306 if (str)
4307 {
4308 begp_orig = begp = str + *beg;
4309 endp_orig = endp = str + *end;
4310 }
4311 else
4312 {
4313 begp_orig = begp = BYTE_POS_ADDR (*beg);
4314 endp_orig = endp = begp + *end - *beg;
4315 }
4316
4317 eol_conversion = (coding->eol_type == CODING_EOL_CR
4318 || coding->eol_type == CODING_EOL_CRLF);
4319
4320 /* Here, we don't have to check coding->pre_write_conversion because
4321 the caller is expected to have handled it already. */
4322 switch (coding->type)
4323 {
4324 case coding_type_iso2022:
4325 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4326 /* We can't skip any data. */
4327 break;
4328 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4329 {
4330 unsigned char *bol = begp;
4331 while (begp < endp && *begp < 0x80)
4332 {
4333 begp++;
4334 if (begp[-1] == '\n')
4335 bol = begp;
4336 }
4337 begp = bol;
4338 goto label_skip_tail;
4339 }
4340 /* fall down ... */
4341
4342 case coding_type_sjis:
4343 case coding_type_big5:
4344 /* We can skip all ASCII characters at the head and tail. */
4345 if (eol_conversion)
4346 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4347 else
4348 while (begp < endp && *begp < 0x80) begp++;
4349 label_skip_tail:
4350 if (eol_conversion)
4351 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4352 else
4353 while (begp < endp && *(endp - 1) < 0x80) endp--;
4354 break;
4355
4356 default:
4357 abort ();
4358 }
4359
4360 *beg += begp - begp_orig;
4361 *end += endp - endp_orig;
4362 return;
4363 }
4364
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG..*END for CODING (and STR) with the encoding
   variant if ENCODEP is nonzero, with the decoding variant otherwise,
   provided that the region is longer than the threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do                                                                    \
    {                                                                   \
      if (shrink_conversion_region_threshhold < *(end) - *(beg))        \
        {                                                               \
          if (!(encodep))                                               \
            shrink_decoding_region (beg, end, coding, str);             \
          else                                                          \
            shrink_encoding_region (beg, end, coding, str);             \
        }                                                               \
    }                                                                   \
  while (0)
4378
4379 static Lisp_Object
4380 code_convert_region_unwind (dummy)
4381 Lisp_Object dummy;
4382 {
4383 inhibit_pre_post_conversion = 0;
4384 return Qnil;
4385 }
4386
4387 /* Store information about all compositions in the range FROM and TO
4388 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4389 buffer or a string, defaults to the current buffer. */
4390
4391 void
4392 coding_save_composition (coding, from, to, obj)
4393 struct coding_system *coding;
4394 int from, to;
4395 Lisp_Object obj;
4396 {
4397 Lisp_Object prop;
4398 int start, end;
4399
4400 if (coding->composing == COMPOSITION_DISABLED)
4401 return;
4402 if (!coding->cmp_data)
4403 coding_allocate_composition_data (coding, from);
4404 if (!find_composition (from, to, &start, &end, &prop, obj)
4405 || end > to)
4406 return;
4407 if (start < from
4408 && (!find_composition (end, to, &start, &end, &prop, obj)
4409 || end > to))
4410 return;
4411 coding->composing = COMPOSITION_NO;
4412 do
4413 {
4414 if (COMPOSITION_VALID_P (start, end, prop))
4415 {
4416 enum composition_method method = COMPOSITION_METHOD (prop);
4417 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4418 >= COMPOSITION_DATA_SIZE)
4419 coding_allocate_composition_data (coding, from);
4420 /* For relative composition, we remember start and end
4421 positions, for the other compositions, we also remember
4422 components. */
4423 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4424 if (method != COMPOSITION_RELATIVE)
4425 {
4426 /* We must store a*/
4427 Lisp_Object val, ch;
4428
4429 val = COMPOSITION_COMPONENTS (prop);
4430 if (CONSP (val))
4431 while (CONSP (val))
4432 {
4433 ch = XCAR (val), val = XCDR (val);
4434 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4435 }
4436 else if (VECTORP (val) || STRINGP (val))
4437 {
4438 int len = (VECTORP (val)
4439 ? XVECTOR (val)->size : XSTRING (val)->size);
4440 int i;
4441 for (i = 0; i < len; i++)
4442 {
4443 ch = (STRINGP (val)
4444 ? Faref (val, make_number (i))
4445 : XVECTOR (val)->contents[i]);
4446 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4447 }
4448 }
4449 else /* INTEGERP (val) */
4450 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4451 }
4452 CODING_ADD_COMPOSITION_END (coding, end - from);
4453 }
4454 start = end;
4455 }
4456 while (start < to
4457 && find_composition (start, to, &start, &end, &prop, obj)
4458 && end <= to);
4459
4460 /* Make coding->cmp_data point to the first memory block. */
4461 while (coding->cmp_data->prev)
4462 coding->cmp_data = coding->cmp_data->prev;
4463 coding->cmp_data_start = 0;
4464 }
4465
4466 /* Reflect the saved information about compositions to OBJ.
4467 CODING->cmp_data points to a memory block for the informaiton. OBJ
4468 is a buffer or a string, defaults to the current buffer. */
4469
4470 void
4471 coding_restore_composition (coding, obj)
4472 struct coding_system *coding;
4473 Lisp_Object obj;
4474 {
4475 struct composition_data *cmp_data = coding->cmp_data;
4476
4477 if (!cmp_data)
4478 return;
4479
4480 while (cmp_data->prev)
4481 cmp_data = cmp_data->prev;
4482
4483 while (cmp_data)
4484 {
4485 int i;
4486
4487 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4488 {
4489 int *data = cmp_data->data + i;
4490 enum composition_method method = (enum composition_method) data[3];
4491 Lisp_Object components;
4492
4493 if (method == COMPOSITION_RELATIVE)
4494 components = Qnil;
4495 else
4496 {
4497 int len = data[0] - 4, j;
4498 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4499
4500 for (j = 0; j < len; j++)
4501 args[j] = make_number (data[4 + j]);
4502 components = (method == COMPOSITION_WITH_ALTCHARS
4503 ? Fstring (len, args) : Fvector (len, args));
4504 }
4505 compose_text (data[1], data[2], components, Qnil, obj);
4506 }
4507 cmp_data = cmp_data->next;
4508 }
4509 }
4510
4511 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4512 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4513 coding system CODING, and return the status code of code conversion
4514 (currently, this value has no meaning).
4515
4516 How many characters (and bytes) are converted to how many
4517 characters (and bytes) are recorded in members of the structure
4518 CODING.
4519
4520 If REPLACE is nonzero, we do various things as if the original text
4521 is deleted and a new text is inserted. See the comments in
4522 replace_range (insdel.c) to know what we are doing.
4523
4524 If REPLACE is zero, it is assumed that the source text is unibyte.
4525 Otherwize, it is assumed that the source text is multibyte. */
4526
4527 int
4528 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4529 int from, from_byte, to, to_byte, encodep, replace;
4530 struct coding_system *coding;
4531 {
4532 int len = to - from, len_byte = to_byte - from_byte;
4533 int require, inserted, inserted_byte;
4534 int head_skip, tail_skip, total_skip = 0;
4535 Lisp_Object saved_coding_symbol;
4536 int first = 1;
4537 unsigned char *src, *dst;
4538 Lisp_Object deletion;
4539 int orig_point = PT, orig_len = len;
4540 int prev_Z;
4541 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4542
4543 coding->src_multibyte = replace && multibyte_p;
4544 coding->dst_multibyte = multibyte_p;
4545
4546 deletion = Qnil;
4547 saved_coding_symbol = Qnil;
4548
4549 if (from < PT && PT < to)
4550 {
4551 TEMP_SET_PT_BOTH (from, from_byte);
4552 orig_point = from;
4553 }
4554
4555 if (replace)
4556 {
4557 int saved_from = from;
4558
4559 prepare_to_modify_buffer (from, to, &from);
4560 if (saved_from != from)
4561 {
4562 to = from + len;
4563 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4564 len_byte = to_byte - from_byte;
4565 }
4566 }
4567
4568 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4569 {
4570 /* We must detect encoding of text and eol format. */
4571
4572 if (from < GPT && to > GPT)
4573 move_gap_both (from, from_byte);
4574 if (coding->type == coding_type_undecided)
4575 {
4576 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4577 if (coding->type == coding_type_undecided)
4578 /* It seems that the text contains only ASCII, but we
4579 should not left it undecided because the deeper
4580 decoding routine (decode_coding) tries to detect the
4581 encodings again in vain. */
4582 coding->type = coding_type_emacs_mule;
4583 }
4584 if (coding->eol_type == CODING_EOL_UNDECIDED)
4585 {
4586 saved_coding_symbol = coding->symbol;
4587 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4588 if (coding->eol_type == CODING_EOL_UNDECIDED)
4589 coding->eol_type = CODING_EOL_LF;
4590 /* We had better recover the original eol format if we
4591 encounter an inconsitent eol format while decoding. */
4592 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4593 }
4594 }
4595
4596 /* Now we convert the text. */
4597
4598 /* For encoding, we must process pre-write-conversion in advance. */
4599 if (! inhibit_pre_post_conversion
4600 && encodep
4601 && SYMBOLP (coding->pre_write_conversion)
4602 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4603 {
4604 /* The function in pre-write-conversion may put a new text in a
4605 new buffer. */
4606 struct buffer *prev = current_buffer;
4607 Lisp_Object new;
4608 int count = specpdl_ptr - specpdl;
4609
4610 record_unwind_protect (code_convert_region_unwind, Qnil);
4611 /* We should not call any more pre-write/post-read-conversion
4612 functions while this pre-write-conversion is running. */
4613 inhibit_pre_post_conversion = 1;
4614 call2 (coding->pre_write_conversion,
4615 make_number (from), make_number (to));
4616 inhibit_pre_post_conversion = 0;
4617 /* Discard the unwind protect. */
4618 specpdl_ptr--;
4619
4620 if (current_buffer != prev)
4621 {
4622 len = ZV - BEGV;
4623 new = Fcurrent_buffer ();
4624 set_buffer_internal_1 (prev);
4625 del_range_2 (from, from_byte, to, to_byte, 0);
4626 TEMP_SET_PT_BOTH (from, from_byte);
4627 insert_from_buffer (XBUFFER (new), 1, len, 0);
4628 Fkill_buffer (new);
4629 if (orig_point >= to)
4630 orig_point += len - orig_len;
4631 else if (orig_point > from)
4632 orig_point = from;
4633 orig_len = len;
4634 to = from + len;
4635 from_byte = CHAR_TO_BYTE (from);
4636 to_byte = CHAR_TO_BYTE (to);
4637 len_byte = to_byte - from_byte;
4638 TEMP_SET_PT_BOTH (from, from_byte);
4639 }
4640 }
4641
4642 if (replace)
4643 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4644
4645 if (coding->composing != COMPOSITION_DISABLED)
4646 {
4647 if (encodep)
4648 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4649 else
4650 coding_allocate_composition_data (coding, from);
4651 }
4652
4653 /* Try to skip the heading and tailing ASCIIs. */
4654 {
4655 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4656
4657 if (from < GPT && GPT < to)
4658 move_gap_both (from, from_byte);
4659 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4660 if (from_byte == to_byte
4661 && (encodep || NILP (coding->post_read_conversion))
4662 && ! CODING_REQUIRE_FLUSHING (coding))
4663 {
4664 coding->produced = len_byte;
4665 coding->produced_char = len;
4666 if (!replace)
4667 /* We must record and adjust for this new text now. */
4668 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4669 return 0;
4670 }
4671
4672 head_skip = from_byte - from_byte_orig;
4673 tail_skip = to_byte_orig - to_byte;
4674 total_skip = head_skip + tail_skip;
4675 from += head_skip;
4676 to -= tail_skip;
4677 len -= total_skip; len_byte -= total_skip;
4678 }
4679
4680 /* The code conversion routine can not preserve text properties for
4681 now. So, we must remove all text properties in the region.
4682 Here, we must suppress all modification hooks. */
4683 if (replace)
4684 {
4685 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4686 inhibit_modification_hooks = 1;
4687 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4688 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4689 }
4690
4691 /* For converion, we must put the gap before the text in addition to
4692 making the gap larger for efficient decoding. The required gap
4693 size starts from 2000 which is the magic number used in make_gap.
4694 But, after one batch of conversion, it will be incremented if we
4695 find that it is not enough . */
4696 require = 2000;
4697
4698 if (GAP_SIZE < require)
4699 make_gap (require - GAP_SIZE);
4700 move_gap_both (from, from_byte);
4701
4702 inserted = inserted_byte = 0;
4703
4704 GAP_SIZE += len_byte;
4705 ZV -= len;
4706 Z -= len;
4707 ZV_BYTE -= len_byte;
4708 Z_BYTE -= len_byte;
4709
4710 if (GPT - BEG < BEG_UNCHANGED)
4711 BEG_UNCHANGED = GPT - BEG;
4712 if (Z - GPT < END_UNCHANGED)
4713 END_UNCHANGED = Z - GPT;
4714
4715 if (!encodep && coding->src_multibyte)
4716 {
4717 /* Decoding routines expects that the source text is unibyte.
4718 We must convert 8-bit characters of multibyte form to
4719 unibyte. */
4720 int len_byte_orig = len_byte;
4721 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4722 if (len_byte < len_byte_orig)
4723 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4724 len_byte);
4725 coding->src_multibyte = 0;
4726 }
4727
4728 for (;;)
4729 {
4730 int result;
4731
4732 /* The buffer memory is now:
4733 +--------+converted-text+---------+-------original-text-------+---+
4734 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4735 |<---------------------- GAP ----------------------->| */
4736 src = GAP_END_ADDR - len_byte;
4737 dst = GPT_ADDR + inserted_byte;
4738
4739 if (encodep)
4740 result = encode_coding (coding, src, dst, len_byte, 0);
4741 else
4742 result = decode_coding (coding, src, dst, len_byte, 0);
4743
4744 /* The buffer memory is now:
4745 +--------+-------converted-text----+--+------original-text----+---+
4746 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4747 |<---------------------- GAP ----------------------->| */
4748
4749 inserted += coding->produced_char;
4750 inserted_byte += coding->produced;
4751 len_byte -= coding->consumed;
4752
4753 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4754 {
4755 coding_allocate_composition_data (coding, from + inserted);
4756 continue;
4757 }
4758
4759 src += coding->consumed;
4760 dst += coding->produced;
4761
4762 if (result == CODING_FINISH_NORMAL)
4763 {
4764 src += len_byte;
4765 break;
4766 }
4767 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4768 {
4769 unsigned char *pend = dst, *p = pend - inserted_byte;
4770 Lisp_Object eol_type;
4771
4772 /* Encode LFs back to the original eol format (CR or CRLF). */
4773 if (coding->eol_type == CODING_EOL_CR)
4774 {
4775 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4776 }
4777 else
4778 {
4779 int count = 0;
4780
4781 while (p < pend) if (*p++ == '\n') count++;
4782 if (src - dst < count)
4783 {
4784 /* We don't have sufficient room for encoding LFs
4785 back to CRLF. We must record converted and
4786 not-yet-converted text back to the buffer
4787 content, enlarge the gap, then record them out of
4788 the buffer contents again. */
4789 int add = len_byte + inserted_byte;
4790
4791 GAP_SIZE -= add;
4792 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4793 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4794 make_gap (count - GAP_SIZE);
4795 GAP_SIZE += add;
4796 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4797 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4798 /* Don't forget to update SRC, DST, and PEND. */
4799 src = GAP_END_ADDR - len_byte;
4800 dst = GPT_ADDR + inserted_byte;
4801 pend = dst;
4802 }
4803 inserted += count;
4804 inserted_byte += count;
4805 coding->produced += count;
4806 p = dst = pend + count;
4807 while (count)
4808 {
4809 *--p = *--pend;
4810 if (*p == '\n') count--, *--p = '\r';
4811 }
4812 }
4813
4814 /* Suppress eol-format conversion in the further conversion. */
4815 coding->eol_type = CODING_EOL_LF;
4816
4817 /* Set the coding system symbol to that for Unix-like EOL. */
4818 eol_type = Fget (saved_coding_symbol, Qeol_type);
4819 if (VECTORP (eol_type)
4820 && XVECTOR (eol_type)->size == 3
4821 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4822 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4823 else
4824 coding->symbol = saved_coding_symbol;
4825
4826 continue;
4827 }
4828 if (len_byte <= 0)
4829 {
4830 if (coding->type != coding_type_ccl
4831 || coding->mode & CODING_MODE_LAST_BLOCK)
4832 break;
4833 coding->mode |= CODING_MODE_LAST_BLOCK;
4834 continue;
4835 }
4836 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4837 {
4838 /* The source text ends in invalid codes. Let's just
4839 make them valid buffer contents, and finish conversion. */
4840 inserted += len_byte;
4841 inserted_byte += len_byte;
4842 while (len_byte--)
4843 *dst++ = *src++;
4844 break;
4845 }
4846 if (result == CODING_FINISH_INTERRUPT)
4847 {
4848 /* The conversion procedure was interrupted by a user. */
4849 break;
4850 }
4851 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4852 if (coding->consumed < 1)
4853 {
4854 /* It's quite strange to require more memory without
4855 consuming any bytes. Perhaps CCL program bug. */
4856 break;
4857 }
4858 if (first)
4859 {
4860 /* We have just done the first batch of conversion which was
4861 stoped because of insufficient gap. Let's reconsider the
4862 required gap size (i.e. SRT - DST) now.
4863
4864 We have converted ORIG bytes (== coding->consumed) into
4865 NEW bytes (coding->produced). To convert the remaining
4866 LEN bytes, we may need REQUIRE bytes of gap, where:
4867 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4868 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4869 Here, we are sure that NEW >= ORIG. */
4870 float ratio = coding->produced - coding->consumed;
4871 ratio /= coding->consumed;
4872 require = len_byte * ratio;
4873 first = 0;
4874 }
4875 if ((src - dst) < (require + 2000))
4876 {
4877 /* See the comment above the previous call of make_gap. */
4878 int add = len_byte + inserted_byte;
4879
4880 GAP_SIZE -= add;
4881 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4882 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4883 make_gap (require + 2000);
4884 GAP_SIZE += add;
4885 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4886 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4887 }
4888 }
4889 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4890
4891 if (encodep && coding->dst_multibyte)
4892 {
4893 /* The output is unibyte. We must convert 8-bit characters to
4894 multibyte form. */
4895 if (inserted_byte * 2 > GAP_SIZE)
4896 {
4897 GAP_SIZE -= inserted_byte;
4898 ZV += inserted_byte; Z += inserted_byte;
4899 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
4900 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4901 make_gap (inserted_byte - GAP_SIZE);
4902 GAP_SIZE += inserted_byte;
4903 ZV -= inserted_byte; Z -= inserted_byte;
4904 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
4905 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4906 }
4907 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
4908 }
4909
4910 /* If we have shrinked the conversion area, adjust it now. */
4911 if (total_skip > 0)
4912 {
4913 if (tail_skip > 0)
4914 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4915 inserted += total_skip; inserted_byte += total_skip;
4916 GAP_SIZE += total_skip;
4917 GPT -= head_skip; GPT_BYTE -= head_skip;
4918 ZV -= total_skip; ZV_BYTE -= total_skip;
4919 Z -= total_skip; Z_BYTE -= total_skip;
4920 from -= head_skip; from_byte -= head_skip;
4921 to += tail_skip; to_byte += tail_skip;
4922 }
4923
4924 prev_Z = Z;
4925 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4926 inserted = Z - prev_Z;
4927
4928 if (!encodep && coding->cmp_data && coding->cmp_data->used)
4929 coding_restore_composition (coding, Fcurrent_buffer ());
4930 coding_free_composition_data (coding);
4931
4932 if (! inhibit_pre_post_conversion
4933 && ! encodep && ! NILP (coding->post_read_conversion))
4934 {
4935 Lisp_Object val;
4936 int count = specpdl_ptr - specpdl;
4937
4938 if (from != PT)
4939 TEMP_SET_PT_BOTH (from, from_byte);
4940 prev_Z = Z;
4941 record_unwind_protect (code_convert_region_unwind, Qnil);
4942 /* We should not call any more pre-write/post-read-conversion
4943 functions while this post-read-conversion is running. */
4944 inhibit_pre_post_conversion = 1;
4945 val = call1 (coding->post_read_conversion, make_number (inserted));
4946 inhibit_pre_post_conversion = 0;
4947 /* Discard the unwind protect. */
4948 specpdl_ptr--;
4949 CHECK_NUMBER (val, 0);
4950 inserted += Z - prev_Z;
4951 }
4952
4953 if (orig_point >= from)
4954 {
4955 if (orig_point >= from + orig_len)
4956 orig_point += inserted - orig_len;
4957 else
4958 orig_point = from;
4959 TEMP_SET_PT (orig_point);
4960 }
4961
4962 if (replace)
4963 {
4964 signal_after_change (from, to - from, inserted);
4965 update_compositions (from, from + inserted, CHECK_BORDER);
4966 }
4967
4968 {
4969 coding->consumed = to_byte - from_byte;
4970 coding->consumed_char = to - from;
4971 coding->produced = inserted_byte;
4972 coding->produced_char = inserted;
4973 }
4974
4975 return 0;
4976 }
4977
4978 Lisp_Object
4979 run_pre_post_conversion_on_str (str, coding, encodep)
4980 Lisp_Object str;
4981 struct coding_system *coding;
4982 int encodep;
4983 {
4984 int count = specpdl_ptr - specpdl;
4985 struct gcpro gcpro1;
4986 struct buffer *prev = current_buffer;
4987 int multibyte = STRING_MULTIBYTE (str);
4988
4989 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4990 record_unwind_protect (code_convert_region_unwind, Qnil);
4991 GCPRO1 (str);
4992 temp_output_buffer_setup (" *code-converting-work*");
4993 set_buffer_internal (XBUFFER (Vstandard_output));
4994 /* We must insert the contents of STR as is without
4995 unibyte<->multibyte conversion. For that, we adjust the
4996 multibyteness of the working buffer to that of STR. */
4997 Ferase_buffer ();
4998 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
4999 insert_from_string (str, 0, 0,
5000 XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5001 UNGCPRO;
5002 inhibit_pre_post_conversion = 1;
5003 if (encodep)
5004 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5005 else
5006 {
5007 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5008 call1 (coding->post_read_conversion, make_number (Z - BEG));
5009 }
5010 inhibit_pre_post_conversion = 0;
5011 str = make_buffer_string (BEG, Z, 0);
5012 return unbind_to (count, str);
5013 }
5014
5015 Lisp_Object
5016 decode_coding_string (str, coding, nocopy)
5017 Lisp_Object str;
5018 struct coding_system *coding;
5019 int nocopy;
5020 {
5021 int len;
5022 char *buf;
5023 int from, to, to_byte;
5024 struct gcpro gcpro1;
5025 Lisp_Object saved_coding_symbol;
5026 int result;
5027
5028 from = 0;
5029 to = XSTRING (str)->size;
5030 to_byte = STRING_BYTES (XSTRING (str));
5031
5032 saved_coding_symbol = Qnil;
5033 if (CODING_REQUIRE_DETECTION (coding))
5034 {
5035 /* See the comments in code_convert_region. */
5036 if (coding->type == coding_type_undecided)
5037 {
5038 detect_coding (coding, XSTRING (str)->data, to_byte);
5039 if (coding->type == coding_type_undecided)
5040 coding->type = coding_type_emacs_mule;
5041 }
5042 if (coding->eol_type == CODING_EOL_UNDECIDED)
5043 {
5044 saved_coding_symbol = coding->symbol;
5045 detect_eol (coding, XSTRING (str)->data, to_byte);
5046 if (coding->eol_type == CODING_EOL_UNDECIDED)
5047 coding->eol_type = CODING_EOL_LF;
5048 /* We had better recover the original eol format if we
5049 encounter an inconsitent eol format while decoding. */
5050 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5051 }
5052 }
5053
5054 if (! CODING_REQUIRE_DECODING (coding))
5055 {
5056 if (!STRING_MULTIBYTE (str))
5057 {
5058 str = Fstring_as_multibyte (str);
5059 nocopy = 1;
5060 }
5061 return (nocopy ? str : Fcopy_sequence (str));
5062 }
5063
5064 if (STRING_MULTIBYTE (str))
5065 {
5066 /* Decoding routines expect the source text to be unibyte. */
5067 str = Fstring_as_unibyte (str);
5068 nocopy = 1;
5069 coding->src_multibyte = 0;
5070 }
5071 coding->dst_multibyte = 1;
5072
5073 if (coding->composing != COMPOSITION_DISABLED)
5074 coding_allocate_composition_data (coding, from);
5075
5076 /* Try to skip the heading and tailing ASCIIs. */
5077 {
5078 int from_orig = from;
5079
5080 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5081 0);
5082 if (from == to_byte)
5083 return (nocopy ? str : Fcopy_sequence (str));
5084 }
5085
5086 len = decoding_buffer_size (coding, to_byte - from);
5087 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5088 GCPRO1 (str);
5089 buf = get_conversion_buffer (len);
5090 UNGCPRO;
5091
5092 if (from > 0)
5093 bcopy (XSTRING (str)->data, buf, from);
5094 result = decode_coding (coding, XSTRING (str)->data + from,
5095 buf + from, to_byte - from, len);
5096 if (result == CODING_FINISH_INCONSISTENT_EOL)
5097 {
5098 /* We simply try to decode the whole string again but without
5099 eol-conversion this time. */
5100 coding->eol_type = CODING_EOL_LF;
5101 coding->symbol = saved_coding_symbol;
5102 coding_free_composition_data (coding);
5103 return decode_coding_string (str, coding, nocopy);
5104 }
5105
5106 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5107 STRING_BYTES (XSTRING (str)) - to_byte);
5108
5109 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5110 str = make_multibyte_string (buf, len + coding->produced_char,
5111 len + coding->produced);
5112
5113 if (coding->cmp_data && coding->cmp_data->used)
5114 coding_restore_composition (coding, str);
5115 coding_free_composition_data (coding);
5116
5117 if (SYMBOLP (coding->post_read_conversion)
5118 && !NILP (Ffboundp (coding->post_read_conversion)))
5119 str = run_pre_post_conversion_on_str (str, coding, 0);
5120
5121 return str;
5122 }
5123
5124 Lisp_Object
5125 encode_coding_string (str, coding, nocopy)
5126 Lisp_Object str;
5127 struct coding_system *coding;
5128 int nocopy;
5129 {
5130 int len;
5131 char *buf;
5132 int from, to, to_byte;
5133 struct gcpro gcpro1;
5134 Lisp_Object saved_coding_symbol;
5135 int result;
5136
5137 if (SYMBOLP (coding->pre_write_conversion)
5138 && !NILP (Ffboundp (coding->pre_write_conversion)))
5139 str = run_pre_post_conversion_on_str (str, coding, 1);
5140
5141 from = 0;
5142 to = XSTRING (str)->size;
5143 to_byte = STRING_BYTES (XSTRING (str));
5144
5145 saved_coding_symbol = Qnil;
5146 if (! CODING_REQUIRE_ENCODING (coding))
5147 {
5148 if (STRING_MULTIBYTE (str))
5149 {
5150 str = Fstring_as_unibyte (str);
5151 nocopy = 1;
5152 }
5153 return (nocopy ? str : Fcopy_sequence (str));
5154 }
5155
5156 /* Encoding routines determine the multibyteness of the source text
5157 by coding->src_multibyte. */
5158 coding->src_multibyte = STRING_MULTIBYTE (str);
5159 coding->dst_multibyte = 0;
5160
5161 if (coding->composing != COMPOSITION_DISABLED)
5162 coding_save_composition (coding, from, to, str);
5163
5164 /* Try to skip the heading and tailing ASCIIs. */
5165 {
5166 int from_orig = from;
5167
5168 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5169 1);
5170 if (from == to_byte)
5171 return (nocopy ? str : Fcopy_sequence (str));
5172 }
5173
5174 len = encoding_buffer_size (coding, to_byte - from);
5175 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5176 GCPRO1 (str);
5177 buf = get_conversion_buffer (len);
5178 UNGCPRO;
5179
5180 if (from > 0)
5181 bcopy (XSTRING (str)->data, buf, from);
5182 result = encode_coding (coding, XSTRING (str)->data + from,
5183 buf + from, to_byte - from, len);
5184 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5185 STRING_BYTES (XSTRING (str)) - to_byte);
5186
5187 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5188 str = make_unibyte_string (buf, len + coding->produced);
5189 coding_free_composition_data (coding);
5190
5191 return str;
5192 }
5193
5194 \f
5195 #ifdef emacs
5196 /*** 8. Emacs Lisp library functions ***/
5197
5198 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5199 "Return t if OBJECT is nil or a coding-system.\n\
5200 See the documentation of `make-coding-system' for information\n\
5201 about coding-system objects.")
5202 (obj)
5203 Lisp_Object obj;
5204 {
5205 if (NILP (obj))
5206 return Qt;
5207 if (!SYMBOLP (obj))
5208 return Qnil;
5209 /* Get coding-spec vector for OBJ. */
5210 obj = Fget (obj, Qcoding_system);
5211 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5212 ? Qt : Qnil);
5213 }
5214
5215 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5216 Sread_non_nil_coding_system, 1, 1, 0,
5217 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5218 (prompt)
5219 Lisp_Object prompt;
5220 {
5221 Lisp_Object val;
5222 do
5223 {
5224 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5225 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5226 }
5227 while (XSTRING (val)->size == 0);
5228 return (Fintern (val, Qnil));
5229 }
5230
5231 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5232 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5233 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5234 (prompt, default_coding_system)
5235 Lisp_Object prompt, default_coding_system;
5236 {
5237 Lisp_Object val;
5238 if (SYMBOLP (default_coding_system))
5239 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5240 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5241 Qt, Qnil, Qcoding_system_history,
5242 default_coding_system, Qnil);
5243 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5244 }
5245
5246 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5247 1, 1, 0,
5248 "Check validity of CODING-SYSTEM.\n\
5249 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5250 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5251 The value of property should be a vector of length 5.")
5252 (coding_system)
5253 Lisp_Object coding_system;
5254 {
5255 CHECK_SYMBOL (coding_system, 0);
5256 if (!NILP (Fcoding_system_p (coding_system)))
5257 return coding_system;
5258 while (1)
5259 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5260 }
5261 \f
5262 Lisp_Object
5263 detect_coding_system (src, src_bytes, highest)
5264 unsigned char *src;
5265 int src_bytes, highest;
5266 {
5267 int coding_mask, eol_type;
5268 Lisp_Object val, tmp;
5269 int dummy;
5270
5271 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5272 eol_type = detect_eol_type (src, src_bytes, &dummy);
5273 if (eol_type == CODING_EOL_INCONSISTENT)
5274 eol_type = CODING_EOL_UNDECIDED;
5275
5276 if (!coding_mask)
5277 {
5278 val = Qundecided;
5279 if (eol_type != CODING_EOL_UNDECIDED)
5280 {
5281 Lisp_Object val2;
5282 val2 = Fget (Qundecided, Qeol_type);
5283 if (VECTORP (val2))
5284 val = XVECTOR (val2)->contents[eol_type];
5285 }
5286 return (highest ? val : Fcons (val, Qnil));
5287 }
5288
5289 /* At first, gather possible coding systems in VAL. */
5290 val = Qnil;
5291 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5292 {
5293 Lisp_Object category_val, category_index;
5294
5295 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5296 category_val = Fsymbol_value (XCAR (tmp));
5297 if (!NILP (category_val)
5298 && NATNUMP (category_index)
5299 && (coding_mask & (1 << XFASTINT (category_index))))
5300 {
5301 val = Fcons (category_val, val);
5302 if (highest)
5303 break;
5304 }
5305 }
5306 if (!highest)
5307 val = Fnreverse (val);
5308
5309 /* Then, replace the elements with subsidiary coding systems. */
5310 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5311 {
5312 if (eol_type != CODING_EOL_UNDECIDED
5313 && eol_type != CODING_EOL_INCONSISTENT)
5314 {
5315 Lisp_Object eol;
5316 eol = Fget (XCAR (tmp), Qeol_type);
5317 if (VECTORP (eol))
5318 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5319 }
5320 }
5321 return (highest ? XCAR (val) : val);
5322 }
5323
5324 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5325 2, 3, 0,
5326 "Detect coding system of the text in the region between START and END.\n\
5327 Return a list of possible coding systems ordered by priority.\n\
5328 \n\
5329 If only ASCII characters are found, it returns a list of single element\n\
5330 `undecided' or its subsidiary coding system according to a detected\n\
5331 end-of-line format.\n\
5332 \n\
5333 If optional argument HIGHEST is non-nil, return the coding system of\n\
5334 highest priority.")
5335 (start, end, highest)
5336 Lisp_Object start, end, highest;
5337 {
5338 int from, to;
5339 int from_byte, to_byte;
5340
5341 CHECK_NUMBER_COERCE_MARKER (start, 0);
5342 CHECK_NUMBER_COERCE_MARKER (end, 1);
5343
5344 validate_region (&start, &end);
5345 from = XINT (start), to = XINT (end);
5346 from_byte = CHAR_TO_BYTE (from);
5347 to_byte = CHAR_TO_BYTE (to);
5348
5349 if (from < GPT && to >= GPT)
5350 move_gap_both (to, to_byte);
5351
5352 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5353 to_byte - from_byte,
5354 !NILP (highest));
5355 }
5356
5357 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5358 1, 2, 0,
5359 "Detect coding system of the text in STRING.\n\
5360 Return a list of possible coding systems ordered by priority.\n\
5361 \n\
5362 If only ASCII characters are found, it returns a list of single element\n\
5363 `undecided' or its subsidiary coding system according to a detected\n\
5364 end-of-line format.\n\
5365 \n\
5366 If optional argument HIGHEST is non-nil, return the coding system of\n\
5367 highest priority.")
5368 (string, highest)
5369 Lisp_Object string, highest;
5370 {
5371 CHECK_STRING (string, 0);
5372
5373 return detect_coding_system (XSTRING (string)->data,
5374 STRING_BYTES (XSTRING (string)),
5375 !NILP (highest));
5376 }
5377
5378 Lisp_Object
5379 code_convert_region1 (start, end, coding_system, encodep)
5380 Lisp_Object start, end, coding_system;
5381 int encodep;
5382 {
5383 struct coding_system coding;
5384 int from, to, len;
5385
5386 CHECK_NUMBER_COERCE_MARKER (start, 0);
5387 CHECK_NUMBER_COERCE_MARKER (end, 1);
5388 CHECK_SYMBOL (coding_system, 2);
5389
5390 validate_region (&start, &end);
5391 from = XFASTINT (start);
5392 to = XFASTINT (end);
5393
5394 if (NILP (coding_system))
5395 return make_number (to - from);
5396
5397 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5398 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5399
5400 coding.mode |= CODING_MODE_LAST_BLOCK;
5401 coding.src_multibyte = coding.dst_multibyte
5402 = !NILP (current_buffer->enable_multibyte_characters);
5403 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5404 &coding, encodep, 1);
5405 Vlast_coding_system_used = coding.symbol;
5406 return make_number (coding.produced_char);
5407 }
5408
5409 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5410 3, 3, "r\nzCoding system: ",
5411 "Decode the current region by specified coding system.\n\
5412 When called from a program, takes three arguments:\n\
5413 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5414 This function sets `last-coding-system-used' to the precise coding system\n\
5415 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5416 not fully specified.)\n\
5417 It returns the length of the decoded text.")
5418 (start, end, coding_system)
5419 Lisp_Object start, end, coding_system;
5420 {
5421 return code_convert_region1 (start, end, coding_system, 0);
5422 }
5423
5424 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5425 3, 3, "r\nzCoding system: ",
5426 "Encode the current region by specified coding system.\n\
5427 When called from a program, takes three arguments:\n\
5428 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5429 This function sets `last-coding-system-used' to the precise coding system\n\
5430 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5431 not fully specified.)\n\
5432 It returns the length of the encoded text.")
5433 (start, end, coding_system)
5434 Lisp_Object start, end, coding_system;
5435 {
5436 return code_convert_region1 (start, end, coding_system, 1);
5437 }
5438
5439 Lisp_Object
5440 code_convert_string1 (string, coding_system, nocopy, encodep)
5441 Lisp_Object string, coding_system, nocopy;
5442 int encodep;
5443 {
5444 struct coding_system coding;
5445
5446 CHECK_STRING (string, 0);
5447 CHECK_SYMBOL (coding_system, 1);
5448
5449 if (NILP (coding_system))
5450 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5451
5452 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5453 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5454
5455 coding.mode |= CODING_MODE_LAST_BLOCK;
5456 string = (encodep
5457 ? encode_coding_string (string, &coding, !NILP (nocopy))
5458 : decode_coding_string (string, &coding, !NILP (nocopy)));
5459 Vlast_coding_system_used = coding.symbol;
5460
5461 return string;
5462 }
5463
5464 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5465 2, 3, 0,
5466 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5467 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5468 if the decoding operation is trivial.\n\
5469 This function sets `last-coding-system-used' to the precise coding system\n\
5470 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5471 not fully specified.)")
5472 (string, coding_system, nocopy)
5473 Lisp_Object string, coding_system, nocopy;
5474 {
5475 return code_convert_string1 (string, coding_system, nocopy, 0);
5476 }
5477
5478 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5479 2, 3, 0,
5480 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5481 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5482 if the encoding operation is trivial.\n\
5483 This function sets `last-coding-system-used' to the precise coding system\n\
5484 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5485 not fully specified.)")
5486 (string, coding_system, nocopy)
5487 Lisp_Object string, coding_system, nocopy;
5488 {
5489 return code_convert_string1 (string, coding_system, nocopy, 1);
5490 }
5491
5492 /* Encode or decode STRING according to CODING_SYSTEM.
5493 Do not set Vlast_coding_system_used.
5494
5495 This function is called only from macros DECODE_FILE and
5496 ENCODE_FILE, thus we ignore character composition. */
5497
5498 Lisp_Object
5499 code_convert_string_norecord (string, coding_system, encodep)
5500 Lisp_Object string, coding_system;
5501 int encodep;
5502 {
5503 struct coding_system coding;
5504
5505 CHECK_STRING (string, 0);
5506 CHECK_SYMBOL (coding_system, 1);
5507
5508 if (NILP (coding_system))
5509 return string;
5510
5511 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5512 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5513
5514 coding.composing = COMPOSITION_DISABLED;
5515 coding.mode |= CODING_MODE_LAST_BLOCK;
5516 return (encodep
5517 ? encode_coding_string (string, &coding, 1)
5518 : decode_coding_string (string, &coding, 1));
5519 }
5520 \f
5521 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5522 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5523 Return the corresponding character.")
5524 (code)
5525 Lisp_Object code;
5526 {
5527 unsigned char c1, c2, s1, s2;
5528 Lisp_Object val;
5529
5530 CHECK_NUMBER (code, 0);
5531 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5532 if (s1 == 0)
5533 {
5534 if (s2 < 0x80)
5535 XSETFASTINT (val, s2);
5536 else if (s2 >= 0xA0 || s2 <= 0xDF)
5537 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5538 else
5539 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5540 }
5541 else
5542 {
5543 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5544 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5545 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5546 DECODE_SJIS (s1, s2, c1, c2);
5547 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5548 }
5549 return val;
5550 }
5551
5552 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5553 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5554 Return the corresponding code in SJIS.")
5555 (ch)
5556 Lisp_Object ch;
5557 {
5558 int charset, c1, c2, s1, s2;
5559 Lisp_Object val;
5560
5561 CHECK_NUMBER (ch, 0);
5562 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5563 if (charset == CHARSET_ASCII)
5564 {
5565 val = ch;
5566 }
5567 else if (charset == charset_jisx0208
5568 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5569 {
5570 ENCODE_SJIS (c1, c2, s1, s2);
5571 XSETFASTINT (val, (s1 << 8) | s2);
5572 }
5573 else if (charset == charset_katakana_jisx0201
5574 && c1 > 0x20 && c2 < 0xE0)
5575 {
5576 XSETFASTINT (val, c1 | 0x80);
5577 }
5578 else
5579 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5580 return val;
5581 }
5582
5583 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5584 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5585 Return the corresponding character.")
5586 (code)
5587 Lisp_Object code;
5588 {
5589 int charset;
5590 unsigned char b1, b2, c1, c2;
5591 Lisp_Object val;
5592
5593 CHECK_NUMBER (code, 0);
5594 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5595 if (b1 == 0)
5596 {
5597 if (b2 >= 0x80)
5598 error ("Invalid BIG5 code: %x", XFASTINT (code));
5599 val = code;
5600 }
5601 else
5602 {
5603 if ((b1 < 0xA1 || b1 > 0xFE)
5604 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5605 error ("Invalid BIG5 code: %x", XFASTINT (code));
5606 DECODE_BIG5 (b1, b2, charset, c1, c2);
5607 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5608 }
5609 return val;
5610 }
5611
5612 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5613 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5614 Return the corresponding character code in Big5.")
5615 (ch)
5616 Lisp_Object ch;
5617 {
5618 int charset, c1, c2, b1, b2;
5619 Lisp_Object val;
5620
5621 CHECK_NUMBER (ch, 0);
5622 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5623 if (charset == CHARSET_ASCII)
5624 {
5625 val = ch;
5626 }
5627 else if ((charset == charset_big5_1
5628 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5629 || (charset == charset_big5_2
5630 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5631 {
5632 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5633 XSETFASTINT (val, (b1 << 8) | b2);
5634 }
5635 else
5636 error ("Can't encode to Big5: %d", XFASTINT (ch));
5637 return val;
5638 }
5639 \f
5640 DEFUN ("set-terminal-coding-system-internal",
5641 Fset_terminal_coding_system_internal,
5642 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5643 (coding_system)
5644 Lisp_Object coding_system;
5645 {
5646 CHECK_SYMBOL (coding_system, 0);
5647 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5648 /* We had better not send unsafe characters to terminal. */
5649 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5650 /* Characer composition should be disabled. */
5651 terminal_coding.composing = COMPOSITION_DISABLED;
5652 terminal_coding.src_multibyte = 1;
5653 terminal_coding.dst_multibyte = 0;
5654 return Qnil;
5655 }
5656
5657 DEFUN ("set-safe-terminal-coding-system-internal",
5658 Fset_safe_terminal_coding_system_internal,
5659 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5660 (coding_system)
5661 Lisp_Object coding_system;
5662 {
5663 CHECK_SYMBOL (coding_system, 0);
5664 setup_coding_system (Fcheck_coding_system (coding_system),
5665 &safe_terminal_coding);
5666 /* Characer composition should be disabled. */
5667 safe_terminal_coding.composing = COMPOSITION_DISABLED;
5668 safe_terminal_coding.src_multibyte = 1;
5669 safe_terminal_coding.dst_multibyte = 0;
5670 return Qnil;
5671 }
5672
5673 DEFUN ("terminal-coding-system",
5674 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5675 "Return coding system specified for terminal output.")
5676 ()
5677 {
5678 return terminal_coding.symbol;
5679 }
5680
5681 DEFUN ("set-keyboard-coding-system-internal",
5682 Fset_keyboard_coding_system_internal,
5683 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5684 (coding_system)
5685 Lisp_Object coding_system;
5686 {
5687 CHECK_SYMBOL (coding_system, 0);
5688 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5689 /* Characer composition should be disabled. */
5690 keyboard_coding.composing = COMPOSITION_DISABLED;
5691 return Qnil;
5692 }
5693
5694 DEFUN ("keyboard-coding-system",
5695 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5696 "Return coding system specified for decoding keyboard input.")
5697 ()
5698 {
5699 return keyboard_coding.symbol;
5700 }
5701
5702 \f
5703 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5704 Sfind_operation_coding_system, 1, MANY, 0,
5705 "Choose a coding system for an operation based on the target name.\n\
5706 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5707 DECODING-SYSTEM is the coding system to use for decoding\n\
5708 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5709 for encoding (in case OPERATION does encoding).\n\
5710 \n\
5711 The first argument OPERATION specifies an I/O primitive:\n\
5712 For file I/O, `insert-file-contents' or `write-region'.\n\
5713 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5714 For network I/O, `open-network-stream'.\n\
5715 \n\
5716 The remaining arguments should be the same arguments that were passed\n\
5717 to the primitive. Depending on which primitive, one of those arguments\n\
5718 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5719 whichever argument specifies the file name is TARGET.\n\
5720 \n\
5721 TARGET has a meaning which depends on OPERATION:\n\
5722 For file I/O, TARGET is a file name.\n\
5723 For process I/O, TARGET is a process name.\n\
5724 For network I/O, TARGET is a service name or a port number\n\
5725 \n\
5726 This function looks up what specified for TARGET in,\n\
5727 `file-coding-system-alist', `process-coding-system-alist',\n\
5728 or `network-coding-system-alist' depending on OPERATION.\n\
5729 They may specify a coding system, a cons of coding systems,\n\
5730 or a function symbol to call.\n\
5731 In the last case, we call the function with one argument,\n\
5732 which is a list of all the arguments given to this function.")
5733 (nargs, args)
5734 int nargs;
5735 Lisp_Object *args;
5736 {
5737 Lisp_Object operation, target_idx, target, val;
5738 register Lisp_Object chain;
5739
5740 if (nargs < 2)
5741 error ("Too few arguments");
5742 operation = args[0];
5743 if (!SYMBOLP (operation)
5744 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5745 error ("Invalid first arguement");
5746 if (nargs < 1 + XINT (target_idx))
5747 error ("Too few arguments for operation: %s",
5748 XSYMBOL (operation)->name->data);
5749 target = args[XINT (target_idx) + 1];
5750 if (!(STRINGP (target)
5751 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5752 error ("Invalid %dth argument", XINT (target_idx) + 1);
5753
5754 chain = ((EQ (operation, Qinsert_file_contents)
5755 || EQ (operation, Qwrite_region))
5756 ? Vfile_coding_system_alist
5757 : (EQ (operation, Qopen_network_stream)
5758 ? Vnetwork_coding_system_alist
5759 : Vprocess_coding_system_alist));
5760 if (NILP (chain))
5761 return Qnil;
5762
5763 for (; CONSP (chain); chain = XCDR (chain))
5764 {
5765 Lisp_Object elt;
5766 elt = XCAR (chain);
5767
5768 if (CONSP (elt)
5769 && ((STRINGP (target)
5770 && STRINGP (XCAR (elt))
5771 && fast_string_match (XCAR (elt), target) >= 0)
5772 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5773 {
5774 val = XCDR (elt);
5775 /* Here, if VAL is both a valid coding system and a valid
5776 function symbol, we return VAL as a coding system. */
5777 if (CONSP (val))
5778 return val;
5779 if (! SYMBOLP (val))
5780 return Qnil;
5781 if (! NILP (Fcoding_system_p (val)))
5782 return Fcons (val, val);
5783 if (! NILP (Ffboundp (val)))
5784 {
5785 val = call1 (val, Flist (nargs, args));
5786 if (CONSP (val))
5787 return val;
5788 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5789 return Fcons (val, val);
5790 }
5791 return Qnil;
5792 }
5793 }
5794 return Qnil;
5795 }
5796
5797 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5798 Supdate_coding_systems_internal, 0, 0, 0,
5799 "Update internal database for ISO2022 and CCL based coding systems.\n\
5800 When values of any coding categories are changed, you must\n\
5801 call this function")
5802 ()
5803 {
5804 int i;
5805
5806 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5807 {
5808 Lisp_Object val;
5809
5810 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5811 if (!NILP (val))
5812 {
5813 if (! coding_system_table[i])
5814 coding_system_table[i] = ((struct coding_system *)
5815 xmalloc (sizeof (struct coding_system)));
5816 setup_coding_system (val, coding_system_table[i]);
5817 }
5818 else if (coding_system_table[i])
5819 {
5820 xfree (coding_system_table[i]);
5821 coding_system_table[i] = NULL;
5822 }
5823 }
5824
5825 return Qnil;
5826 }
5827
5828 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5829 Sset_coding_priority_internal, 0, 0, 0,
5830 "Update internal database for the current value of `coding-category-list'.\n\
5831 This function is internal use only.")
5832 ()
5833 {
5834 int i = 0, idx;
5835 Lisp_Object val;
5836
5837 val = Vcoding_category_list;
5838
5839 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5840 {
5841 if (! SYMBOLP (XCAR (val)))
5842 break;
5843 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5844 if (idx >= CODING_CATEGORY_IDX_MAX)
5845 break;
5846 coding_priorities[i++] = (1 << idx);
5847 val = XCDR (val);
5848 }
5849 /* If coding-category-list is valid and contains all coding
5850 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5851 the following code saves Emacs from crashing. */
5852 while (i < CODING_CATEGORY_IDX_MAX)
5853 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5854
5855 return Qnil;
5856 }
5857
5858 #endif /* emacs */
5859
5860 \f
5861 /*** 9. Post-amble ***/
5862
5863 void
5864 init_coding ()
5865 {
5866 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5867 }
5868
5869 void
5870 init_coding_once ()
5871 {
5872 int i;
5873
5874 /* Emacs' internal format specific initialize routine. */
5875 for (i = 0; i <= 0x20; i++)
5876 emacs_code_class[i] = EMACS_control_code;
5877 emacs_code_class[0x0A] = EMACS_linefeed_code;
5878 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5879 for (i = 0x21 ; i < 0x7F; i++)
5880 emacs_code_class[i] = EMACS_ascii_code;
5881 emacs_code_class[0x7F] = EMACS_control_code;
5882 for (i = 0x80; i < 0xFF; i++)
5883 emacs_code_class[i] = EMACS_invalid_code;
5884 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5885 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5886 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5887 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5888
5889 /* ISO2022 specific initialize routine. */
5890 for (i = 0; i < 0x20; i++)
5891 iso_code_class[i] = ISO_control_0;
5892 for (i = 0x21; i < 0x7F; i++)
5893 iso_code_class[i] = ISO_graphic_plane_0;
5894 for (i = 0x80; i < 0xA0; i++)
5895 iso_code_class[i] = ISO_control_1;
5896 for (i = 0xA1; i < 0xFF; i++)
5897 iso_code_class[i] = ISO_graphic_plane_1;
5898 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5899 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5900 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5901 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5902 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5903 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5904 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5905 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5906 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5907 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5908
5909 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5910
5911 setup_coding_system (Qnil, &keyboard_coding);
5912 setup_coding_system (Qnil, &terminal_coding);
5913 setup_coding_system (Qnil, &safe_terminal_coding);
5914 setup_coding_system (Qnil, &default_buffer_file_coding);
5915
5916 bzero (coding_system_table, sizeof coding_system_table);
5917
5918 bzero (ascii_skip_code, sizeof ascii_skip_code);
5919 for (i = 0; i < 128; i++)
5920 ascii_skip_code[i] = 1;
5921
5922 #if defined (MSDOS) || defined (WINDOWSNT)
5923 system_eol_type = CODING_EOL_CRLF;
5924 #else
5925 system_eol_type = CODING_EOL_LF;
5926 #endif
5927
5928 inhibit_pre_post_conversion = 0;
5929 }
5930
5931 #ifdef emacs
5932
5933 void
5934 syms_of_coding ()
5935 {
5936 Qtarget_idx = intern ("target-idx");
5937 staticpro (&Qtarget_idx);
5938
5939 Qcoding_system_history = intern ("coding-system-history");
5940 staticpro (&Qcoding_system_history);
5941 Fset (Qcoding_system_history, Qnil);
5942
5943 /* Target FILENAME is the first argument. */
5944 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5945 /* Target FILENAME is the third argument. */
5946 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5947
5948 Qcall_process = intern ("call-process");
5949 staticpro (&Qcall_process);
5950 /* Target PROGRAM is the first argument. */
5951 Fput (Qcall_process, Qtarget_idx, make_number (0));
5952
5953 Qcall_process_region = intern ("call-process-region");
5954 staticpro (&Qcall_process_region);
5955 /* Target PROGRAM is the third argument. */
5956 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5957
5958 Qstart_process = intern ("start-process");
5959 staticpro (&Qstart_process);
5960 /* Target PROGRAM is the third argument. */
5961 Fput (Qstart_process, Qtarget_idx, make_number (2));
5962
5963 Qopen_network_stream = intern ("open-network-stream");
5964 staticpro (&Qopen_network_stream);
5965 /* Target SERVICE is the fourth argument. */
5966 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5967
5968 Qcoding_system = intern ("coding-system");
5969 staticpro (&Qcoding_system);
5970
5971 Qeol_type = intern ("eol-type");
5972 staticpro (&Qeol_type);
5973
5974 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5975 staticpro (&Qbuffer_file_coding_system);
5976
5977 Qpost_read_conversion = intern ("post-read-conversion");
5978 staticpro (&Qpost_read_conversion);
5979
5980 Qpre_write_conversion = intern ("pre-write-conversion");
5981 staticpro (&Qpre_write_conversion);
5982
5983 Qno_conversion = intern ("no-conversion");
5984 staticpro (&Qno_conversion);
5985
5986 Qundecided = intern ("undecided");
5987 staticpro (&Qundecided);
5988
5989 Qcoding_system_p = intern ("coding-system-p");
5990 staticpro (&Qcoding_system_p);
5991
5992 Qcoding_system_error = intern ("coding-system-error");
5993 staticpro (&Qcoding_system_error);
5994
5995 Fput (Qcoding_system_error, Qerror_conditions,
5996 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5997 Fput (Qcoding_system_error, Qerror_message,
5998 build_string ("Invalid coding system"));
5999
6000 Qcoding_category = intern ("coding-category");
6001 staticpro (&Qcoding_category);
6002 Qcoding_category_index = intern ("coding-category-index");
6003 staticpro (&Qcoding_category_index);
6004
6005 Vcoding_category_table
6006 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6007 staticpro (&Vcoding_category_table);
6008 {
6009 int i;
6010 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6011 {
6012 XVECTOR (Vcoding_category_table)->contents[i]
6013 = intern (coding_category_name[i]);
6014 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6015 Qcoding_category_index, make_number (i));
6016 }
6017 }
6018
6019 Qtranslation_table = intern ("translation-table");
6020 staticpro (&Qtranslation_table);
6021 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6022
6023 Qtranslation_table_id = intern ("translation-table-id");
6024 staticpro (&Qtranslation_table_id);
6025
6026 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6027 staticpro (&Qtranslation_table_for_decode);
6028
6029 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6030 staticpro (&Qtranslation_table_for_encode);
6031
6032 Qsafe_charsets = intern ("safe-charsets");
6033 staticpro (&Qsafe_charsets);
6034
6035 Qvalid_codes = intern ("valid-codes");
6036 staticpro (&Qvalid_codes);
6037
6038 Qemacs_mule = intern ("emacs-mule");
6039 staticpro (&Qemacs_mule);
6040
6041 Qraw_text = intern ("raw-text");
6042 staticpro (&Qraw_text);
6043
6044 defsubr (&Scoding_system_p);
6045 defsubr (&Sread_coding_system);
6046 defsubr (&Sread_non_nil_coding_system);
6047 defsubr (&Scheck_coding_system);
6048 defsubr (&Sdetect_coding_region);
6049 defsubr (&Sdetect_coding_string);
6050 defsubr (&Sdecode_coding_region);
6051 defsubr (&Sencode_coding_region);
6052 defsubr (&Sdecode_coding_string);
6053 defsubr (&Sencode_coding_string);
6054 defsubr (&Sdecode_sjis_char);
6055 defsubr (&Sencode_sjis_char);
6056 defsubr (&Sdecode_big5_char);
6057 defsubr (&Sencode_big5_char);
6058 defsubr (&Sset_terminal_coding_system_internal);
6059 defsubr (&Sset_safe_terminal_coding_system_internal);
6060 defsubr (&Sterminal_coding_system);
6061 defsubr (&Sset_keyboard_coding_system_internal);
6062 defsubr (&Skeyboard_coding_system);
6063 defsubr (&Sfind_operation_coding_system);
6064 defsubr (&Supdate_coding_systems_internal);
6065 defsubr (&Sset_coding_priority_internal);
6066
6067 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6068 "List of coding systems.\n\
6069 \n\
6070 Do not alter the value of this variable manually. This variable should be\n\
6071 updated by the functions `make-coding-system' and\n\
6072 `define-coding-system-alias'.");
6073 Vcoding_system_list = Qnil;
6074
6075 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6076 "Alist of coding system names.\n\
6077 Each element is one element list of coding system name.\n\
6078 This variable is given to `completing-read' as TABLE argument.\n\
6079 \n\
6080 Do not alter the value of this variable manually. This variable should be\n\
6081 updated by the functions `make-coding-system' and\n\
6082 `define-coding-system-alias'.");
6083 Vcoding_system_alist = Qnil;
6084
6085 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6086 "List of coding-categories (symbols) ordered by priority.");
6087 {
6088 int i;
6089
6090 Vcoding_category_list = Qnil;
6091 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6092 Vcoding_category_list
6093 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6094 Vcoding_category_list);
6095 }
6096
6097 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6098 "Specify the coding system for read operations.\n\
6099 It is useful to bind this variable with `let', but do not set it globally.\n\
6100 If the value is a coding system, it is used for decoding on read operation.\n\
6101 If not, an appropriate element is used from one of the coding system alists:\n\
6102 There are three such tables, `file-coding-system-alist',\n\
6103 `process-coding-system-alist', and `network-coding-system-alist'.");
6104 Vcoding_system_for_read = Qnil;
6105
6106 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6107 "Specify the coding system for write operations.\n\
6108 Programs bind this variable with `let', but you should not set it globally.\n\
6109 If the value is a coding system, it is used for encoding of output,\n\
6110 when writing it to a file and when sending it to a file or subprocess.\n\
6111 \n\
6112 If this does not specify a coding system, an appropriate element\n\
6113 is used from one of the coding system alists:\n\
6114 There are three such tables, `file-coding-system-alist',\n\
6115 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6116 For output to files, if the above procedure does not specify a coding system,\n\
6117 the value of `buffer-file-coding-system' is used.");
6118 Vcoding_system_for_write = Qnil;
6119
6120 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6121 "Coding system used in the latest file or process I/O.");
6122 Vlast_coding_system_used = Qnil;
6123
6124 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6125 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6126 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6127 such conversion.");
6128 inhibit_eol_conversion = 0;
6129
6130 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6131 "Non-nil means process buffer inherits coding system of process output.\n\
6132 Bind it to t if the process output is to be treated as if it were a file\n\
6133 read from some filesystem.");
6134 inherit_process_coding_system = 0;
6135
6136 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6137 "Alist to decide a coding system to use for a file I/O operation.\n\
6138 The format is ((PATTERN . VAL) ...),\n\
6139 where PATTERN is a regular expression matching a file name,\n\
6140 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6141 If VAL is a coding system, it is used for both decoding and encoding\n\
6142 the file contents.\n\
6143 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6144 and the cdr part is used for encoding.\n\
6145 If VAL is a function symbol, the function must return a coding system\n\
6146 or a cons of coding systems which are used as above.\n\
6147 \n\
6148 See also the function `find-operation-coding-system'\n\
6149 and the variable `auto-coding-alist'.");
6150 Vfile_coding_system_alist = Qnil;
6151
6152 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6153 "Alist to decide a coding system to use for a process I/O operation.\n\
6154 The format is ((PATTERN . VAL) ...),\n\
6155 where PATTERN is a regular expression matching a program name,\n\
6156 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6157 If VAL is a coding system, it is used for both decoding what received\n\
6158 from the program and encoding what sent to the program.\n\
6159 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6160 and the cdr part is used for encoding.\n\
6161 If VAL is a function symbol, the function must return a coding system\n\
6162 or a cons of coding systems which are used as above.\n\
6163 \n\
6164 See also the function `find-operation-coding-system'.");
6165 Vprocess_coding_system_alist = Qnil;
6166
6167 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6168 "Alist to decide a coding system to use for a network I/O operation.\n\
6169 The format is ((PATTERN . VAL) ...),\n\
6170 where PATTERN is a regular expression matching a network service name\n\
6171 or is a port number to connect to,\n\
6172 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6173 If VAL is a coding system, it is used for both decoding what received\n\
6174 from the network stream and encoding what sent to the network stream.\n\
6175 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6176 and the cdr part is used for encoding.\n\
6177 If VAL is a function symbol, the function must return a coding system\n\
6178 or a cons of coding systems which are used as above.\n\
6179 \n\
6180 See also the function `find-operation-coding-system'.");
6181 Vnetwork_coding_system_alist = Qnil;
6182
6183 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6184 "Coding system to use with system messages.");
6185 Vlocale_coding_system = Qnil;
6186
6187 /* The eol mnemonics are reset in startup.el system-dependently. */
6188 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6189 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6190 eol_mnemonic_unix = build_string (":");
6191
6192 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6193 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6194 eol_mnemonic_dos = build_string ("\\");
6195
6196 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6197 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6198 eol_mnemonic_mac = build_string ("/");
6199
6200 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6201 "*String displayed in mode line when end-of-line format is not yet determined.");
6202 eol_mnemonic_undecided = build_string (":");
6203
6204 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6205 "*Non-nil enables character translation while encoding and decoding.");
6206 Venable_character_translation = Qt;
6207
6208 DEFVAR_LISP ("standard-translation-table-for-decode",
6209 &Vstandard_translation_table_for_decode,
6210 "Table for translating characters while decoding.");
6211 Vstandard_translation_table_for_decode = Qnil;
6212
6213 DEFVAR_LISP ("standard-translation-table-for-encode",
6214 &Vstandard_translation_table_for_encode,
6215 "Table for translationg characters while encoding.");
6216 Vstandard_translation_table_for_encode = Qnil;
6217
6218 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6219 "Alist of charsets vs revision numbers.\n\
6220 While encoding, if a charset (car part of an element) is found,\n\
6221 designate it with the escape sequence identifing revision (cdr part of the element).");
6222 Vcharset_revision_alist = Qnil;
6223
6224 DEFVAR_LISP ("default-process-coding-system",
6225 &Vdefault_process_coding_system,
6226 "Cons of coding systems used for process I/O by default.\n\
6227 The car part is used for decoding a process output,\n\
6228 the cdr part is used for encoding a text to be sent to a process.");
6229 Vdefault_process_coding_system = Qnil;
6230
6231 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6232 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6233 This is a vector of length 256.\n\
6234 If Nth element is non-nil, the existence of code N in a file\n\
6235 \(or output of subprocess) doesn't prevent it to be detected as\n\
6236 a coding system of ISO 2022 variant which has a flag\n\
6237 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6238 or reading output of a subprocess.\n\
6239 Only 128th through 159th elements has a meaning.");
6240 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6241
6242 DEFVAR_LISP ("select-safe-coding-system-function",
6243 &Vselect_safe_coding_system_function,
6244 "Function to call to select safe coding system for encoding a text.\n\
6245 \n\
6246 If set, this function is called to force a user to select a proper\n\
6247 coding system which can encode the text in the case that a default\n\
6248 coding system used in each operation can't encode the text.\n\
6249 \n\
6250 The default value is `select-safe-coding-system' (which see).");
6251 Vselect_safe_coding_system_function = Qnil;
6252
6253 }
6254
6255 char *
6256 emacs_strerror (error_number)
6257 int error_number;
6258 {
6259 char *str;
6260
6261 synchronize_system_messages_locale ();
6262 str = strerror (error_number);
6263
6264 if (! NILP (Vlocale_coding_system))
6265 {
6266 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6267 Vlocale_coding_system,
6268 0);
6269 str = (char *) XSTRING (dec)->data;
6270 }
6271
6272 return str;
6273 }
6274
6275 #endif /* emacs */
6276