]> code.delx.au - gnu-emacs/blob - src/coding.c
(code_convert_region): Even if the length of text is
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 0. General comments
25 1. Preamble
26 2. Emacs' internal format (emacs-mule) handlers
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. CCL handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
33 9. Post-amble
34
35 */
36
37 /*** 0. General comments ***/
38
39
40 /*** GENERAL NOTE on CODING SYSTEM ***
41
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
47 coding system.
48
49 0. Emacs' internal format (emacs-mule)
50
51 Emacs itself holds a multi-lingual character in a buffer and a string
52 in a special format. Details are described in section 2.
53
54 1. ISO2022
55
56 The most famous coding system for multiple character sets. X's
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
60
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
62
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
65 section 4.
66
67 3. BIG5
68
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
74
75 4. Raw text
76
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
79
80 5. Other
81
82 If a user wants to read/write a text encoded in a coding system not
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
86
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
89 information about it is set in a structure of type `struct
90 coding_system' for rapid processing. See section 6 for more details.
91
92 */
93
94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
95
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
98 whereas DOS's format is two-byte sequence of `carriage-return' and
99 `line-feed' codes. MacOS's format is usually one byte of
100 `carriage-return'.
101
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
104 any format of end-of-line. So, Emacs has information of format of
105 end-of-line in each coding-system. See section 6 for more details.
106
107 */
108
109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
110
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which appropriate flag bits for the category XXX is set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
116 #if 0
117 int
118 detect_coding_emacs_mule (src, src_end)
119 unsigned char *src, *src_end;
120 {
121 ...
122 }
123 #endif
124
125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
126
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
131
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
136
137 DST_BYTES zero means that source area and destination area are
138 overlapped, which means that we can produce a decoded text until it
139 reaches at the head of not-yet-decoded source text.
140
141 Below is a template of these functions. */
142 #if 0
143 static void
144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 {
149 ...
150 }
151 #endif
152
153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
154
155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
159
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
164
165 DST_BYTES zero means that source area and destination area are
166 overlapped, which means that we can produce an encoded text until it
167 reaches at the head of not-yet-encoded source text.
168
169 Below is a template of these functions. */
170 #if 0
171 static void
172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
173 struct coding_system *coding;
174 unsigned char *source, *destination;
175 int src_bytes, dst_bytes;
176 {
177 ...
178 }
179 #endif
180
181 /*** COMMONLY USED MACROS ***/
182
183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
190
/* Store the next source byte in BYTE and step `src' past it.  If
   `src' has already reached `src_end', set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
   without touching BYTE.  */
#define ONE_MORE_BYTE(byte)                                     \
  do {                                                          \
    if (src < src_end)                                          \
      (byte) = *src++;                                          \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
200
/* Store the next two source bytes in FIRST and SECOND and step `src'
   past both.  If fewer than two bytes remain before `src_end', set
   `coding->result' to CODING_FINISH_INSUFFICIENT_SRC and jump to
   `label_end_of_loop' without consuming anything.  */
#define TWO_MORE_BYTES(first, second)                           \
  do {                                                          \
    if (src_end - src < 2)                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    (first) = src[0];                                           \
    (second) = src[1];                                          \
    src += 2;                                                   \
  } while (0)
211
212
/* Read one character from the source text at `src' into C and advance
   `src' past it.  When no byte is left before `src_end', set
   `coding->result' to CODING_FINISH_INSUFFICIENT_SRC and jump to
   `label_end_of_loop'.

   The caller must have `coding', `src', `src_end' and
   `translation_table' in scope.  This macro is meant for the encoding
   routines `encode_coding_XXX', so the source is taken to be in
   multibyte form except possibly for 8-bit characters: those are in
   multibyte form when coding->src_multibyte is nonzero, and are
   single bytes otherwise.

   When `translation_table' is non-nil, the character is passed
   through translate_char before being stored in C.  */

#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))      \
      {                                                         \
        /* A lone byte that is not a multibyte sequence.  */    \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    if (!NILP (translation_table))                              \
      c = translate_char (translation_table, c, 0, 0, 0);       \
    src += bytes;                                               \
  } while (0)
241
242
/* Write the multibyte form of character C at `dst' and advance `dst'.
   If the form does not fit (before `dst_end', or before `src' when
   `dst_bytes' is zero), set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.

   Nothing is written to `dst' while a composition other than
   COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE is in progress; in
   that case the decoded character may be an ALTCHAR of the current
   composition and goes to coding->cmp_data->data instead.  For every
   composition except COMPOSITION_RELATIVE the character is recorded
   with CODING_ADD_COMPOSITION_COMPONENT, and
   coding->composition_rule_follows is updated.

   This macro is used in decoding routines.  Note that C is expanded
   more than once.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! (COMPOSING_P (coding)                                         \
           && coding->composing != COMPOSITION_RELATIVE                 \
           && coding->composing != COMPOSITION_WITH_RULE))              \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst_bytes ? dst_end : src) < (dst + bytes))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = (coding->composing != COMPOSITION_WITH_ALTCHARS);           \
      }                                                                 \
  } while (0)
277
278
/* Append the single byte C at `dst'.  The output limit is `dst_end',
   or `src' when `dst_bytes' is zero; if `dst' has reached it, set
   `coding->result' to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = (c);                                             \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
288
/* Append the bytes C1 and C2, in that order, at `dst'.  The output
   limit is `dst_end', or `src' when `dst_bytes' is zero; if fewer
   than two bytes of room remain, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'
   without writing anything.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if ((dst_bytes ? dst_end : src) - dst < 2)                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = (c1);                                              \
    *dst++ = (c2);                                              \
  } while (0)
298
/* Copy the bytes in [FROM, TO) to `dst', advancing both `dst' and
   FROM.  FROM must be a modifiable pointer lvalue; it equals TO when
   the copy completes.  TO is expanded more than once, so it should
   have no side effects.

   The output limit is `dst_end', or `src' when `dst_bytes' is zero.
   If the whole range does not fit, nothing is copied: set
   `coding->result' to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.

   Both arguments are parenthesized at every use so that an argument
   containing a low-precedence operator (for example `cond ? p : q')
   is not regrouped by the surrounding `-' and `<'.  */
#define EMIT_BYTES(from, to)                                            \
  do {                                                                  \
    if (dst + ((to) - (from)) > (dst_bytes ? dst_end : src))            \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    while ((from) < (to))                                               \
      *dst++ = *(from)++;                                               \
  } while (0)
309
310 \f
311 /*** 1. Preamble ***/
312
313 #ifdef emacs
314 #include <config.h>
315 #endif
316
317 #include <stdio.h>
318
319 #ifdef emacs
320
321 #include "lisp.h"
322 #include "buffer.h"
323 #include "charset.h"
324 #include "composite.h"
325 #include "ccl.h"
326 #include "coding.h"
327 #include "window.h"
328
329 #else /* not emacs */
330
331 #include "mulelib.h"
332
333 #endif /* not emacs */
334
335 Lisp_Object Qcoding_system, Qeol_type;
336 Lisp_Object Qbuffer_file_coding_system;
337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
338 Lisp_Object Qno_conversion, Qundecided;
339 Lisp_Object Qcoding_system_history;
340 Lisp_Object Qsafe_charsets;
341 Lisp_Object Qvalid_codes;
342
343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
345 Lisp_Object Qstart_process, Qopen_network_stream;
346 Lisp_Object Qtarget_idx;
347
348 Lisp_Object Vselect_safe_coding_system_function;
349
350 /* Mnemonic string for each format of end-of-line. */
351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
352 /* Mnemonic string to indicate format of end-of-line is not yet
353 decided. */
354 Lisp_Object eol_mnemonic_undecided;
355
356 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
358 int system_eol_type;
359
#ifdef emacs

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* emacs-mule and raw-text are the coding systems that convert
   nothing but the end-of-line format.  */
Lisp_Object Qemacs_mule;
Lisp_Object Qraw_text;

/* The next three variables are how coding systems are handed back
   and forth between Emacs Lisp programs and the C internals.  */

/* Coding system for reading files and for receiving data from a
   process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding system for writing files and for sending data to a
   process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding system that the most recent I/O actually used.  */
Lisp_Object Vlast_coding_system_used;

/* Vector of length 256 describing special Latin codes (in particular
   for coping with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag that inhibits conversion of the end-of-line format.  */
int inhibit_eol_conversion;

/* Flag that makes buffer-file-coding-system inherit from
   process-coding.  */
int inherit_process_coding_system;

/* Coding system used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system used to encode text for terminal display when the
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of the data sent from the terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system for writing a file.  */
struct coding_system default_buffer_file_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
409
410 Lisp_Object Qcoding_category, Qcoding_category_index;
411
412 /* List of symbols `coding-category-xxx' ordered by priority. */
413 Lisp_Object Vcoding_category_list;
414
415 /* Table of coding categories (Lisp symbols). */
416 Lisp_Object Vcoding_category_table;
417
418 /* Table of names of symbol for each coding-category. */
419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
420 "coding-category-emacs-mule",
421 "coding-category-sjis",
422 "coding-category-iso-7",
423 "coding-category-iso-7-tight",
424 "coding-category-iso-8-1",
425 "coding-category-iso-8-2",
426 "coding-category-iso-7-else",
427 "coding-category-iso-8-else",
428 "coding-category-ccl",
429 "coding-category-big5",
430 "coding-category-utf-8",
431 "coding-category-utf-16-be",
432 "coding-category-utf-16-le",
433 "coding-category-raw-text",
434 "coding-category-binary"
435 };
436
437 /* Table of pointers to coding systems corresponding to each coding
438 categories. */
439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
440
441 /* Table of coding category masks. Nth element is a mask for a coding
442 cateogry of which priority is Nth. */
443 static
444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
445
446 /* Flag to tell if we look up translation table on character code
447 conversion. */
448 Lisp_Object Venable_character_translation;
449 /* Standard translation table to look up on decoding (reading). */
450 Lisp_Object Vstandard_translation_table_for_decode;
451 /* Standard translation table to look up on encoding (writing). */
452 Lisp_Object Vstandard_translation_table_for_encode;
453
454 Lisp_Object Qtranslation_table;
455 Lisp_Object Qtranslation_table_id;
456 Lisp_Object Qtranslation_table_for_decode;
457 Lisp_Object Qtranslation_table_for_encode;
458
459 /* Alist of charsets vs revision number. */
460 Lisp_Object Vcharset_revision_alist;
461
462 /* Default coding systems used for process I/O. */
463 Lisp_Object Vdefault_process_coding_system;
464
465 /* Global flag to tell that we can't call post-read-conversion and
466 pre-write-conversion functions. Usually the value is zero, but it
467 is set to 1 temporarily while such functions are running. This is
468 to avoid infinite recursive call. */
469 static int inhibit_pre_post_conversion;
470
471 \f
472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
473
474 /* Emacs' internal format for encoding multiple character sets is a
475 kind of multi-byte encoding, i.e. characters are encoded by
476 variable-length sequences of one-byte codes.
477
478 ASCII characters and control characters (e.g. `tab', `newline') are
479 represented by one-byte sequences which are their ASCII codes, in
480 the range 0x00 through 0x7F.
481
482 8-bit characters of the range 0x80..0x9F are represented by
483 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
484 code + 0x20).
485
486 8-bit characters of the range 0xA0..0xFF are represented by
487 one-byte sequences which are their 8-bit code.
488
489 The other characters are represented by a sequence of `base
490 leading-code', optional `extended leading-code', and one or two
491 `position-code's. The length of the sequence is determined by the
492 base leading-code. Leading-code takes the range 0x80 through 0x9F,
493 whereas extended leading-code and position-code take the range 0xA0
494 through 0xFF. See `charset.h' for more details about leading-code
495 and position-code.
496
497 --- CODE RANGE of Emacs' internal format ---
498 character set range
499 ------------- -----
500 ascii 0x00..0x7F
501 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
502 eight-bit-graphic 0xA0..0xFF
503 ELSE 0x81..0x9F + [0xA0..0xFF]+
504 ---------------------------------------------
505
506 */
507
508 enum emacs_code_class_type emacs_code_class[256];
509
510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
511 Check if a text is encoded in Emacs' internal format. If it is,
512 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
513
514 int
515 detect_coding_emacs_mule (src, src_end)
516 unsigned char *src, *src_end;
517 {
518 unsigned char c;
519 int composing = 0;
520 /* Dummy for ONE_MORE_BYTE. */
521 struct coding_system dummy_coding;
522 struct coding_system *coding = &dummy_coding;
523
524 while (1)
525 {
526 ONE_MORE_BYTE (c);
527
528 if (composing)
529 {
530 if (c < 0xA0)
531 composing = 0;
532 else if (c == 0xA0)
533 {
534 ONE_MORE_BYTE (c);
535 c &= 0x7F;
536 }
537 else
538 c -= 0x20;
539 }
540
541 if (c < 0x20)
542 {
543 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
544 return 0;
545 }
546 else if (c >= 0x80 && c < 0xA0)
547 {
548 if (c == 0x80)
549 /* Old leading code for a composite character. */
550 composing = 1;
551 else
552 {
553 unsigned char *src_base = src - 1;
554 int bytes;
555
556 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
557 bytes))
558 return 0;
559 src = src_base + bytes;
560 }
561 }
562 }
563 label_end_of_loop:
564 return CODING_CATEGORY_MASK_EMACS_MULE;
565 }
566
567
568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
569
570 static void
571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
572 struct coding_system *coding;
573 unsigned char *source, *destination;
574 int src_bytes, dst_bytes;
575 {
576 unsigned char *src = source;
577 unsigned char *src_end = source + src_bytes;
578 unsigned char *dst = destination;
579 unsigned char *dst_end = destination + dst_bytes;
580 /* SRC_BASE remembers the start position in source in each loop.
581 The loop will be exited when there's not enough source code, or
582 when there's not enough destination area to produce a
583 character. */
584 unsigned char *src_base;
585
586 coding->produced_char = 0;
587 while ((src_base = src) < src_end)
588 {
589 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
590 int bytes;
591
592 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
593 {
594 p = src;
595 src += bytes;
596 }
597 else
598 {
599 bytes = CHAR_STRING (*src, tmp);
600 p = tmp;
601 src++;
602 }
603 if (dst + bytes >= (dst_bytes ? dst_end : src))
604 {
605 coding->result = CODING_FINISH_INSUFFICIENT_DST;
606 break;
607 }
608 while (bytes--) *dst++ = *p++;
609 coding->produced_char++;
610 }
611 coding->consumed = coding->consumed_char = src_base - source;
612 coding->produced = dst - destination;
613 }
614
/* Encoding to emacs-mule needs no work beyond what encode_eol does,
   so hand all five arguments to it unchanged.  */
#define encode_coding_emacs_mule(cs, from, to, n_from, n_to) \
  encode_eol (cs, from, to, n_from, n_to)
617
618
619 \f
620 /*** 3. ISO2022 handlers ***/
621
622 /* The following note describes the coding system ISO2022 briefly.
623 Since the intention of this note is to help understand the
624 functions in this file, some parts are NOT ACCURATE or OVERLY
625 SIMPLIFIED. For thorough understanding, please refer to the
626 original document of ISO2022.
627
628 ISO2022 provides many mechanisms to encode several character sets
629 in 7-bit and 8-bit environments. For 7-bit environments, all text
630 is encoded using bytes less than 128. This may make the encoded
631 text a little bit longer, but the text passes more easily through
632 several gateways, some of which strip off MSB (Most Significant Bit).
633
634 There are two kinds of character sets: control character set and
635 graphic character set. The former contains control characters such
636 as `newline' and `escape' to provide control functions (control
637 functions are also provided by escape sequences). The latter
638 contains graphic characters such as 'A' and '-'. Emacs recognizes
639 two control character sets and many graphic character sets.
640
641 Graphic character sets are classified into one of the following
642 four classes, according to the number of bytes (DIMENSION) and
643 number of characters in one dimension (CHARS) of the set:
644 - DIMENSION1_CHARS94
645 - DIMENSION1_CHARS96
646 - DIMENSION2_CHARS94
647 - DIMENSION2_CHARS96
648
649 In addition, each character set is assigned an identification tag,
650 unique for each set, called "final character" (denoted as <F>
651 hereafter). The <F> of each character set is decided by ECMA(*)
652 when it is registered in ISO. The code range of <F> is 0x30..0x7F
653 (0x30..0x3F are for private use only).
654
655 Note (*): ECMA = European Computer Manufacturers Association
656
657 Here are examples of graphic character set [NAME(<F>)]:
658 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
659 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
660 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
661 o DIMENSION2_CHARS96 -- none for the moment
662
663 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
664 C0 [0x00..0x1F] -- control character plane 0
665 GL [0x20..0x7F] -- graphic character plane 0
666 C1 [0x80..0x9F] -- control character plane 1
667 GR [0xA0..0xFF] -- graphic character plane 1
668
669 A control character set is directly designated and invoked to C0 or
670 C1 by an escape sequence. The most common case is that:
671 - ISO646's control character set is designated/invoked to C0, and
672 - ISO6429's control character set is designated/invoked to C1,
673 and usually these designations/invocations are omitted in encoded
674 text. In a 7-bit environment, only C0 can be used, and a control
675 character for C1 is encoded by an appropriate escape sequence to
676 fit into the environment. All control characters for C1 are
677 defined to have corresponding escape sequences.
678
679 A graphic character set is at first designated to one of four
680 graphic registers (G0 through G3), then these graphic registers are
681 invoked to GL or GR. These designations and invocations can be
682 done independently. The most common case is that G0 is invoked to
683 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
684 these invocations and designations are omitted in encoded text.
685 In a 7-bit environment, only GL can be used.
686
687 When a graphic character set of CHARS94 is invoked to GL, codes
688 0x20 and 0x7F of the GL area work as control characters SPACE and
689 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
690 be used.
691
692 There are two ways of invocation: locking-shift and single-shift.
693 With locking-shift, the invocation lasts until the next different
694 invocation, whereas with single-shift, the invocation affects the
695 following character only and doesn't affect the locking-shift
696 state. Invocations are done by the following control characters or
697 escape sequences:
698
699 ----------------------------------------------------------------------
700 abbrev function cntrl escape seq description
701 ----------------------------------------------------------------------
702 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
703 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
704 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
705 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
706 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
707 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
708 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
709 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
710 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
711 ----------------------------------------------------------------------
712 (*) These are not used by any known coding system.
713
714 Control characters for these functions are defined by macros
715 ISO_CODE_XXX in `coding.h'.
716
717 Designations are done by the following escape sequences:
718 ----------------------------------------------------------------------
719 escape sequence description
720 ----------------------------------------------------------------------
721 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
722 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
723 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
724 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
725 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
726 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
727 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
728 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
729 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
730 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
731 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
732 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
733 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
734 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
735 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
736 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
737 ----------------------------------------------------------------------
738
739 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
740 of dimension 1, chars 94, and final character <F>, etc...
741
742 Note (*): Although these designations are not allowed in ISO2022,
743 Emacs accepts them on decoding, and produces them on encoding
744 CHARS96 character sets in a coding system which is characterized as
745 7-bit environment, non-locking-shift, and non-single-shift.
746
747 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
748 '(' can be omitted. We refer to this as "short-form" hereafter.
749
750 Now you may notice that there are a lot of ways for encoding the
751 same multilingual text in ISO2022. Actually, there exist many
752 coding systems such as Compound Text (used in X11's inter client
753 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
754 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
755 localized platforms), and all of these are variants of ISO2022.
756
757 In addition to the above, Emacs handles two more kinds of escape
758 sequences: ISO6429's direction specification and Emacs' private
759 sequence for specifying character composition.
760
761 ISO6429's direction specification takes the following form:
762 o CSI ']' -- end of the current direction
763 o CSI '0' ']' -- end of the current direction
764 o CSI '1' ']' -- start of left-to-right text
765 o CSI '2' ']' -- start of right-to-left text
766 The control character CSI (0x9B: control sequence introducer) is
767 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
768
769 Character composition specification takes the following form:
770 o ESC '0' -- start relative composition
771 o ESC '1' -- end composition
772 o ESC '2' -- start rule-base composition (*)
773 o ESC '3' -- start relative composition with alternate chars (**)
774 o ESC '4' -- start rule-base composition with alternate chars (**)
775 Since these are not standard escape sequences of any ISO standard,
776 the use of them for these meaning is restricted to Emacs only.
777
778 (*) This form is used only in Emacs 20.5 and the older versions,
779 but the newer versions can safely decode it.
780 (**) This form is used only in Emacs 21.1 and the newer versions,
781 and the older versions can't decode it.
782
783 Here's a list of example usages of these composition escape
784 sequences (categorized by `enum composition_method').
785
786 COMPOSITION_RELATIVE:
787 ESC 0 CHAR [ CHAR ] ESC 1
788 COMPOSITION_WITH_RULE:
789 ESC 2 CHAR [ RULE CHAR ] ESC 1
790 COMPOSITION_WITH_ALTCHARS:
791 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
792 COMPOSITION_WITH_RULE_ALTCHARS:
793 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
794
795 enum iso_code_class_type iso_code_class[256];
796
797 #define CHARSET_OK(idx, charset) \
798 (coding_system_table[idx] \
799 && (coding_system_table[idx]->safe_charsets[charset] \
800 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
801 (coding_system_table[idx], charset) \
802 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
803
804 #define SHIFT_OUT_OK(idx) \
805 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
806
807 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
808 Check if a text is encoded in ISO2022. If it is, returns an
809 integer in which appropriate flag bits any of:
810 CODING_CATEGORY_MASK_ISO_7
811 CODING_CATEGORY_MASK_ISO_7_TIGHT
812 CODING_CATEGORY_MASK_ISO_8_1
813 CODING_CATEGORY_MASK_ISO_8_2
814 CODING_CATEGORY_MASK_ISO_7_ELSE
815 CODING_CATEGORY_MASK_ISO_8_ELSE
816 are set. If a code which should never appear in ISO2022 is found,
817 returns 0. */
818
819 int
820 detect_coding_iso2022 (src, src_end)
821 unsigned char *src, *src_end;
822 {
823 int mask = CODING_CATEGORY_MASK_ISO;
824 int mask_found = 0;
825 int reg[4], shift_out = 0, single_shifting = 0;
826 int c, c1, i, charset;
827 /* Dummy for ONE_MORE_BYTE. */
828 struct coding_system dummy_coding;
829 struct coding_system *coding = &dummy_coding;
830
831 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
832 while (mask && src < src_end)
833 {
834 ONE_MORE_BYTE (c);
835 switch (c)
836 {
837 case ISO_CODE_ESC:
838 single_shifting = 0;
839 ONE_MORE_BYTE (c);
840 if (c >= '(' && c <= '/')
841 {
842 /* Designation sequence for a charset of dimension 1. */
843 ONE_MORE_BYTE (c1);
844 if (c1 < ' ' || c1 >= 0x80
845 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
846 /* Invalid designation sequence. Just ignore. */
847 break;
848 reg[(c - '(') % 4] = charset;
849 }
850 else if (c == '$')
851 {
852 /* Designation sequence for a charset of dimension 2. */
853 ONE_MORE_BYTE (c);
854 if (c >= '@' && c <= 'B')
855 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
856 reg[0] = charset = iso_charset_table[1][0][c];
857 else if (c >= '(' && c <= '/')
858 {
859 ONE_MORE_BYTE (c1);
860 if (c1 < ' ' || c1 >= 0x80
861 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
862 /* Invalid designation sequence. Just ignore. */
863 break;
864 reg[(c - '(') % 4] = charset;
865 }
866 else
867 /* Invalid designation sequence. Just ignore. */
868 break;
869 }
870 else if (c == 'N' || c == 'O')
871 {
872 /* ESC <Fe> for SS2 or SS3. */
873 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
874 break;
875 }
876 else if (c >= '0' && c <= '4')
877 {
878 /* ESC <Fp> for start/end composition. */
879 mask_found |= CODING_CATEGORY_MASK_ISO;
880 break;
881 }
882 else
883 /* Invalid escape sequence. Just ignore. */
884 break;
885
886 /* We found a valid designation sequence for CHARSET. */
887 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
888 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
889 mask_found |= CODING_CATEGORY_MASK_ISO_7;
890 else
891 mask &= ~CODING_CATEGORY_MASK_ISO_7;
892 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
893 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
894 else
895 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
896 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
897 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
898 else
899 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
900 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
901 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
902 else
903 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
904 break;
905
906 case ISO_CODE_SO:
907 single_shifting = 0;
908 if (shift_out == 0
909 && (reg[1] >= 0
910 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
911 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
912 {
913 /* Locking shift out. */
914 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
915 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
916 }
917 break;
918
919 case ISO_CODE_SI:
920 single_shifting = 0;
921 if (shift_out == 1)
922 {
923 /* Locking shift in. */
924 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
925 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
926 }
927 break;
928
929 case ISO_CODE_CSI:
930 single_shifting = 0;
931 case ISO_CODE_SS2:
932 case ISO_CODE_SS3:
933 {
934 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
935
936 if (c != ISO_CODE_CSI)
937 {
938 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
939 & CODING_FLAG_ISO_SINGLE_SHIFT)
940 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
941 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
942 & CODING_FLAG_ISO_SINGLE_SHIFT)
943 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
944 single_shifting = 1;
945 }
946 if (VECTORP (Vlatin_extra_code_table)
947 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
948 {
949 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
950 & CODING_FLAG_ISO_LATIN_EXTRA)
951 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
952 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
953 & CODING_FLAG_ISO_LATIN_EXTRA)
954 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
955 }
956 mask &= newmask;
957 mask_found |= newmask;
958 }
959 break;
960
961 default:
962 if (c < 0x80)
963 {
964 single_shifting = 0;
965 break;
966 }
967 else if (c < 0xA0)
968 {
969 single_shifting = 0;
970 if (VECTORP (Vlatin_extra_code_table)
971 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
972 {
973 int newmask = 0;
974
975 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
976 & CODING_FLAG_ISO_LATIN_EXTRA)
977 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
978 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
979 & CODING_FLAG_ISO_LATIN_EXTRA)
980 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
981 mask &= newmask;
982 mask_found |= newmask;
983 }
984 else
985 return 0;
986 }
987 else
988 {
989 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
990 | CODING_CATEGORY_MASK_ISO_7_ELSE);
991 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
992 /* Check the length of succeeding codes of the range
993 0xA0..0FF. If the byte length is odd, we exclude
994 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
995 when we are not single shifting. */
996 if (!single_shifting
997 && mask & CODING_CATEGORY_MASK_ISO_8_2)
998 {
999 int i = 1;
1000 while (src < src_end)
1001 {
1002 ONE_MORE_BYTE (c);
1003 if (c < 0xA0)
1004 break;
1005 i++;
1006 }
1007
1008 if (i & 1 && src < src_end)
1009 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1010 else
1011 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1012 }
1013 }
1014 break;
1015 }
1016 }
1017 label_end_of_loop:
1018 return (mask & mask_found);
1019 }
1020
/* Yield the character code of the character whose charset is CHARSET
   and whose position codes are C1 and C2.  When the variable
   `translation_table' is non-nil the code is obtained through
   translate_char with that table; otherwise it is MAKE_CHAR's
   value.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2) \
  (!NILP (translation_table) \
   ? translate_char (translation_table, -1, charset, c1, c2) \
   : MAKE_CHAR (charset, c1, c2))
1030
/* Record in CODING that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR is designated to graphic register REG.

   Control jumps to label_invalid_code when FINAL_CHAR is outside
   '0'..127, when no acceptable charset is found (REG is then stored
   as the last invalid designation register), or when ASCII is
   designated to register 0 right after an invalid designation to
   register 0.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset; \
 \
    if ((final_char) < '0' || (final_char) >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    /* Reject a charset which is unknown, or which is neither \
       requested for REG nor listed as safe.  */ \
    if (charset < 0 \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != (reg) \
            && !coding->safe_charsets[charset])) \
      { \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code; \
      } \
    if (coding->spec.iso2022.last_invalid_designation_register == 0 \
        && (reg) == 0 \
        && charset == CHARSET_ASCII) \
      { \
        /* This designation sequence should be inserted as is so \
           that it is surely written back to a file.  */ \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        goto label_invalid_code; \
      } \
    coding->spec.iso2022.last_invalid_designation_register = -1; \
    /* In the alternate direction, use the reverse charset if any.  */ \
    if ((coding->mode & CODING_MODE_DIRECTION) \
        && CHARSET_REVERSE_CHARSET (charset) >= 0) \
      charset = CHARSET_REVERSE_CHARSET (charset); \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1066
1067 /* Allocate a memory block for storing information about compositions.
1068 The block is chained to the already allocated blocks. */
1069
1070 void
1071 coding_allocate_composition_data (coding, char_offset)
1072 struct coding_system *coding;
1073 int char_offset;
1074 {
1075 struct composition_data *cmp_data
1076 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1077
1078 cmp_data->char_offset = char_offset;
1079 cmp_data->used = 0;
1080 cmp_data->prev = coding->cmp_data;
1081 cmp_data->next = NULL;
1082 if (coding->cmp_data)
1083 coding->cmp_data->next = cmp_data;
1084 coding->cmp_data = cmp_data;
1085 coding->cmp_data_start = 0;
1086 }
1087
/* Open a new four-integer record for one composition in the current
   composition data block of CODING: slot 0 is set to -1, slot 1 to
   the block's char_offset plus START, and slot 3 to METHOD.  Slot 2
   is left unset here.  CODING->cmp_data_start remembers where the
   record begins.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method) \
  do { \
    struct composition_data *cmp_block = (coding)->cmp_data; \
    int *slot = cmp_block->data + cmp_block->used; \
 \
    (coding)->cmp_data_start = cmp_block->used; \
    slot[0] = -1; \
    slot[1] = cmp_block->char_offset + (start); \
    slot[3] = (int) (method); \
    cmp_block->used += 4; \
  } while (0)
1100
/* Close the composition record that begins at CODING->cmp_data_start:
   slot 0 receives the number of integers used by the record and
   slot 2 the block's char_offset plus END.  */

#define CODING_ADD_COMPOSITION_END(coding, end) \
  do { \
    struct composition_data *cmp_block = (coding)->cmp_data; \
    int *slot = cmp_block->data + (coding)->cmp_data_start; \
 \
    slot[0] = cmp_block->used - (coding)->cmp_data_start; \
    slot[2] = cmp_block->char_offset + (end); \
  } while (0)
1110
/* Append one COMPONENT (alternate character or composition rule) to
   the current composition data block of CODING.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1115
/* Handle a composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4,
   where C1 is the byte following ESC.  */

#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (coding->composing == COMPOSITION_DISABLED) \
      { \
        /* Composition is disabled: pass the sequence through.  */ \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = (c1) & 0x7f; \
        coding->produced_char += 2; \
      } \
    else if (COMPOSING_P (coding)) \
      { \
        /* A composition is already in progress.  For the two methods \
           below, the codes following this escape sequence are the \
           actual characters stored in a buffer.  */ \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
          { \
            coding->composing = COMPOSITION_RELATIVE; \
            coding->composition_rule_follows = 0; \
          } \
      } \
    else \
      { \
        /* This is the start of a composition, so coding->cmp_data \
           must have room for the information about it.  If it has \
           not, leave the decoding loop; the caller allocates one \
           more block for coding->cmp_data and restarts the loop. \
           Memory can't be allocated here directly because that may \
           cause buffer/string relocation.  */ \
        if (!coding->cmp_data \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE)) \
          { \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
            goto label_end_of_loop; \
          } \
        if ((c1) == '0') \
          coding->composing = COMPOSITION_RELATIVE; \
        else if ((c1) == '2') \
          coding->composing = COMPOSITION_WITH_RULE; \
        else if ((c1) == '3') \
          coding->composing = COMPOSITION_WITH_ALTCHARS; \
        else \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS; \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
                                      coding->composing); \
        coding->composition_rule_follows = 0; \
      } \
  } while (0)
1163
/* Handle the composition end sequence ESC 1, where C1 is the byte
   following ESC.  Unless composition is disabled, the current
   composition record is closed at the present character count;
   otherwise the sequence is copied to the output as is.  */

#define DECODE_COMPOSITION_END(c1) \
  do { \
    if (coding->composing != COMPOSITION_DISABLED) \
      { \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
        coding->composing = COMPOSITION_NO; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = (c1); \
        coding->produced_char += 2; \
      } \
  } while (0)
1180
/* Decode a composition rule from the byte C1 (and, for the new
   format, one more byte read from SRC into the enclosing C2) and
   store the encoded rule in coding->cmp_data.  C1 is modified in
   place.  A value outside both formats stores rule 0.  */

#define DECODE_COMPOSITION_RULE(c1) \
  do { \
    int rule = 0; \
 \
    (c1) -= 32; \
    if ((c1) < 81) \
      { \
        /* Old format (before ver.21): one byte holds both reference \
           points, and the value 4 stands for 10.  */ \
        int gref = (c1) / 9; \
        int nref = (c1) % 9; \
 \
        gref = (gref == 4 ? 10 : gref); \
        nref = (nref == 4 ? 10 : nref); \
        rule = COMPOSITION_ENCODE_RULE (gref, nref); \
      } \
    else if ((c1) < 93) \
      { \
        /* New format (after ver.21): a second byte follows.  */ \
        ONE_MORE_BYTE (c2); \
        rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32); \
      } \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
    coding->composition_rule_follows = 0; \
  } while (0)
1205
1206
1207 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1208
1209 static void
1210 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1211 struct coding_system *coding;
1212 unsigned char *source, *destination;
1213 int src_bytes, dst_bytes;
1214 {
1215 unsigned char *src = source;
1216 unsigned char *src_end = source + src_bytes;
1217 unsigned char *dst = destination;
1218 unsigned char *dst_end = destination + dst_bytes;
1219 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1220 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1221 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1222 /* SRC_BASE remembers the start position in source in each loop.
1223 The loop will be exited when there's not enough source code
1224 (within macro ONE_MORE_BYTE), or when there's not enough
1225 destination area to produce a character (within macro
1226 EMIT_CHAR). */
1227 unsigned char *src_base;
1228 int c, charset;
1229 Lisp_Object translation_table;
1230
1231 if (NILP (Venable_character_translation))
1232 translation_table = Qnil;
1233 else
1234 {
1235 translation_table = coding->translation_table_for_decode;
1236 if (NILP (translation_table))
1237 translation_table = Vstandard_translation_table_for_decode;
1238 }
1239
1240 coding->result = CODING_FINISH_NORMAL;
1241
1242 while (1)
1243 {
1244 int c1, c2;
1245
1246 src_base = src;
1247 ONE_MORE_BYTE (c1);
1248
1249 /* We produce no character or one character. */
1250 switch (iso_code_class [c1])
1251 {
1252 case ISO_0x20_or_0x7F:
1253 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1254 {
1255 DECODE_COMPOSITION_RULE (c1);
1256 continue;
1257 }
1258 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1259 {
1260 /* This is SPACE or DEL. */
1261 charset = CHARSET_ASCII;
1262 break;
1263 }
1264 /* This is a graphic character, we fall down ... */
1265
1266 case ISO_graphic_plane_0:
1267 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1268 {
1269 DECODE_COMPOSITION_RULE (c1);
1270 continue;
1271 }
1272 charset = charset0;
1273 break;
1274
1275 case ISO_0xA0_or_0xFF:
1276 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1277 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1278 goto label_invalid_code;
1279 /* This is a graphic character, we fall down ... */
1280
1281 case ISO_graphic_plane_1:
1282 if (charset1 < 0)
1283 goto label_invalid_code;
1284 charset = charset1;
1285 break;
1286
1287 case ISO_control_0:
1288 if (COMPOSING_P (coding))
1289 DECODE_COMPOSITION_END ('1');
1290
1291 /* All ISO2022 control characters in this class have the
1292 same representation in Emacs internal format. */
1293 if (c1 == '\n'
1294 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1295 && (coding->eol_type == CODING_EOL_CR
1296 || coding->eol_type == CODING_EOL_CRLF))
1297 {
1298 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1299 goto label_end_of_loop;
1300 }
1301 charset = CHARSET_ASCII;
1302 break;
1303
1304 case ISO_control_1:
1305 if (COMPOSING_P (coding))
1306 DECODE_COMPOSITION_END ('1');
1307 goto label_invalid_code;
1308
1309 case ISO_carriage_return:
1310 if (COMPOSING_P (coding))
1311 DECODE_COMPOSITION_END ('1');
1312
1313 if (coding->eol_type == CODING_EOL_CR)
1314 c1 = '\n';
1315 else if (coding->eol_type == CODING_EOL_CRLF)
1316 {
1317 ONE_MORE_BYTE (c1);
1318 if (c1 != ISO_CODE_LF)
1319 {
1320 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1321 {
1322 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1323 goto label_end_of_loop;
1324 }
1325 src--;
1326 c1 = '\r';
1327 }
1328 }
1329 charset = CHARSET_ASCII;
1330 break;
1331
1332 case ISO_shift_out:
1333 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1334 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1335 goto label_invalid_code;
1336 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1337 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1338 continue;
1339
1340 case ISO_shift_in:
1341 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1342 goto label_invalid_code;
1343 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1344 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1345 continue;
1346
1347 case ISO_single_shift_2_7:
1348 case ISO_single_shift_2:
1349 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1350 goto label_invalid_code;
1351 /* SS2 is handled as an escape sequence of ESC 'N' */
1352 c1 = 'N';
1353 goto label_escape_sequence;
1354
1355 case ISO_single_shift_3:
1356 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1357 goto label_invalid_code;
1358 /* SS2 is handled as an escape sequence of ESC 'O' */
1359 c1 = 'O';
1360 goto label_escape_sequence;
1361
1362 case ISO_control_sequence_introducer:
1363 /* CSI is handled as an escape sequence of ESC '[' ... */
1364 c1 = '[';
1365 goto label_escape_sequence;
1366
1367 case ISO_escape:
1368 ONE_MORE_BYTE (c1);
1369 label_escape_sequence:
1370 /* Escape sequences handled by Emacs are invocation,
1371 designation, direction specification, and character
1372 composition specification. */
1373 switch (c1)
1374 {
1375 case '&': /* revision of following character set */
1376 ONE_MORE_BYTE (c1);
1377 if (!(c1 >= '@' && c1 <= '~'))
1378 goto label_invalid_code;
1379 ONE_MORE_BYTE (c1);
1380 if (c1 != ISO_CODE_ESC)
1381 goto label_invalid_code;
1382 ONE_MORE_BYTE (c1);
1383 goto label_escape_sequence;
1384
1385 case '$': /* designation of 2-byte character set */
1386 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1387 goto label_invalid_code;
1388 ONE_MORE_BYTE (c1);
1389 if (c1 >= '@' && c1 <= 'B')
1390 { /* designation of JISX0208.1978, GB2312.1980,
1391 or JISX0208.1980 */
1392 DECODE_DESIGNATION (0, 2, 94, c1);
1393 }
1394 else if (c1 >= 0x28 && c1 <= 0x2B)
1395 { /* designation of DIMENSION2_CHARS94 character set */
1396 ONE_MORE_BYTE (c2);
1397 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1398 }
1399 else if (c1 >= 0x2C && c1 <= 0x2F)
1400 { /* designation of DIMENSION2_CHARS96 character set */
1401 ONE_MORE_BYTE (c2);
1402 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1403 }
1404 else
1405 goto label_invalid_code;
1406 /* We must update these variables now. */
1407 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1408 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1409 continue;
1410
1411 case 'n': /* invocation of locking-shift-2 */
1412 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1413 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1414 goto label_invalid_code;
1415 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1416 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1417 continue;
1418
1419 case 'o': /* invocation of locking-shift-3 */
1420 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1421 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1422 goto label_invalid_code;
1423 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1424 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1425 continue;
1426
1427 case 'N': /* invocation of single-shift-2 */
1428 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1429 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1430 goto label_invalid_code;
1431 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1432 ONE_MORE_BYTE (c1);
1433 break;
1434
1435 case 'O': /* invocation of single-shift-3 */
1436 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1437 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1438 goto label_invalid_code;
1439 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1440 ONE_MORE_BYTE (c1);
1441 break;
1442
1443 case '0': case '2': case '3': case '4': /* start composition */
1444 DECODE_COMPOSITION_START (c1);
1445 continue;
1446
1447 case '1': /* end composition */
1448 DECODE_COMPOSITION_END (c1);
1449 continue;
1450
1451 case '[': /* specification of direction */
1452 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1453 goto label_invalid_code;
1454 /* For the moment, nested direction is not supported.
1455 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1456 left-to-right, and nozero means right-to-left. */
1457 ONE_MORE_BYTE (c1);
1458 switch (c1)
1459 {
1460 case ']': /* end of the current direction */
1461 coding->mode &= ~CODING_MODE_DIRECTION;
1462
1463 case '0': /* end of the current direction */
1464 case '1': /* start of left-to-right direction */
1465 ONE_MORE_BYTE (c1);
1466 if (c1 == ']')
1467 coding->mode &= ~CODING_MODE_DIRECTION;
1468 else
1469 goto label_invalid_code;
1470 break;
1471
1472 case '2': /* start of right-to-left direction */
1473 ONE_MORE_BYTE (c1);
1474 if (c1 == ']')
1475 coding->mode |= CODING_MODE_DIRECTION;
1476 else
1477 goto label_invalid_code;
1478 break;
1479
1480 default:
1481 goto label_invalid_code;
1482 }
1483 continue;
1484
1485 default:
1486 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1487 goto label_invalid_code;
1488 if (c1 >= 0x28 && c1 <= 0x2B)
1489 { /* designation of DIMENSION1_CHARS94 character set */
1490 ONE_MORE_BYTE (c2);
1491 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1492 }
1493 else if (c1 >= 0x2C && c1 <= 0x2F)
1494 { /* designation of DIMENSION1_CHARS96 character set */
1495 ONE_MORE_BYTE (c2);
1496 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1497 }
1498 else
1499 goto label_invalid_code;
1500 /* We must update these variables now. */
1501 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1502 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1503 continue;
1504 }
1505 }
1506
1507 /* Now we know CHARSET and 1st position code C1 of a character.
1508 Produce a multibyte sequence for that character while getting
1509 2nd position code C2 if necessary. */
1510 if (CHARSET_DIMENSION (charset) == 2)
1511 {
1512 ONE_MORE_BYTE (c2);
1513 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1514 /* C2 is not in a valid range. */
1515 goto label_invalid_code;
1516 }
1517 c = DECODE_ISO_CHARACTER (charset, c1, c2);
1518 EMIT_CHAR (c);
1519 continue;
1520
1521 label_invalid_code:
1522 coding->errors++;
1523 if (COMPOSING_P (coding))
1524 DECODE_COMPOSITION_END ('1');
1525 src = src_base;
1526 c = *src++;
1527 EMIT_CHAR (c);
1528 }
1529
1530 label_end_of_loop:
1531 coding->consumed = coding->consumed_char = src_base - source;
1532 coding->produced = dst - destination;
1533 return;
1534 }
1535
1536
1537 /* ISO2022 encoding stuff. */
1538
1539 /*
1540 It is not enough to say just "ISO2022" on encoding, we have to
1541 specify more details. In Emacs, each coding system of ISO2022
1542 variant has the following specifications:
1543 1. Initial designation to G0 thru G3.
1544 2. Allows short-form designation?
1545 3. ASCII should be designated to G0 before control characters?
1546 4. ASCII should be designated to G0 at end of line?
1547 5. 7-bit environment or 8-bit environment?
1548 6. Use locking-shift?
1549 7. Use Single-shift?
1550 And the following two are only for Japanese:
1551 8. Use ASCII in place of JIS0201-1976-Roman?
1552 9. Use JISX0208-1983 in place of JISX0208-1978?
1553 These specifications are encoded in `coding->flags' as flag bits
1554 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1555 details.
1556 */
1557
/* Write at DST the escape sequence which designates CHARSET to
   graphic register REG, advance DST, and record the designation in
   CODING.  The sequence is preceded by a revision sequence
   `ESC & <rev>' when the revision number of CHARSET is below 255.
   The intermediate character is left out (short form) only for a
   two-dimensional 94-charset designated to register 0 whose
   <final-char> is '@', 'A', or 'B', and only when CODING allows
   it.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset); \
    int is_94 = CHARSET_CHARS (charset) == 94; \
    /* Intermediate characters indexed by graphic register.  */ \
    char *intermediates = is_94 ? "()*+" : ",-./"; \
 \
    if (revision < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + revision; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) != 1) \
      *dst++ = '$'; \
    if (CHARSET_DIMENSION (charset) == 1 \
        || !is_94 \
        || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
        || reg != 0 \
        || final_char < '@' || final_char > 'B') \
      *dst++ = (unsigned char) (intermediates[reg]); \
    *dst++ = final_char; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1600
/* Single-shift functions.  Each of the two macros below writes at
   DST either the 8-bit control code (ISO_CODE_SS2 / ISO_CODE_SS3)
   or, when CODING has CODING_FLAG_ISO_SEVEN_BITS set, the escape
   sequence ESC 'N' / ESC 'O', and then marks CODING as single
   shifting.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1622
/* Locking-shift functions.  Each of the four macros below writes the
   control character or escape sequence for one function (shift-in,
   shift-out, locking-shift-2, locking-shift-3) at DST and records
   in CODING that graphic register 0, 1, 2, or 3 respectively is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
  } while (0)
1650
/* Write at DST the code of a DIMENSION1 character whose charset is
   CHARSET and whose position code is C1.  The body is a loop: when
   CHARSET is not invoked to either graphic plane, designation and
   invocation sequences are produced first and the checks are run
   again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single shift was just produced; this character uses it.  */ \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? (c1) & 0x7F : (c1) | 0x80); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F; \
        break; \
      } \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80; \
        break; \
      } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !coding->safe_charsets[charset]) \
      { \
        /* This character should not be encoded; produce one or two \
           substitution characters instead.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it, \
       designating it to some graphic register first if needed, then \
       go round the loop again to produce the character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1693
/* Write at DST the codes of a DIMENSION2 character whose charset is
   CHARSET and whose position codes are C1 and C2.  The body is a
   loop: when CHARSET is not invoked to either graphic plane,
   designation and invocation sequences are produced first and the
   checks are run again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single shift was just produced; this character uses it.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = (c1) & 0x7F; \
            *dst++ = (c2) & 0x7F; \
          } \
        else \
          { \
            *dst++ = (c1) | 0x80; \
            *dst++ = (c2) | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F; \
        *dst++ = (c2) & 0x7F; \
        break; \
      } \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80; \
        *dst++ = (c2) | 0x80; \
        break; \
      } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !coding->safe_charsets[charset]) \
      { \
        /* This character should not be encoded; produce one or two \
           substitution characters instead.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it, \
       designating it to some graphic register first if needed, then \
       go round the loop again to produce the character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1736
/* Write at DST the ISO2022 codes of the character whose charset is
   CHARSET and whose position codes are C1 and C2.  If CHARSET is not
   defined, C1 (and C2 when it is not negative) are written as they
   are.  Otherwise the character is encoded according to the
   dimension of CHARSET, after replacing ASCII by
   charset_latin_jisx0201 under CODING_FLAG_ISO_USE_ROMAN, or
   charset_jisx0208 by charset_jisx0208_1978 under
   CODING_FLAG_ISO_USE_OLDJIS.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int alt_charset = charset; \
 \
    if (!CHARSET_DEFINED_P (charset)) \
      { \
        *dst++ = c1; \
        if (c2 >= 0) \
          *dst++ = c2; \
      } \
    else if (CHARSET_DIMENSION (charset) == 1) \
      { \
        if (charset == CHARSET_ASCII \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
          alt_charset = charset_latin_jisx0201; \
        ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1); \
      } \
    else \
      { \
        if (charset == charset_jisx0208 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
          alt_charset = charset_jisx0208_1978; \
        ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2); \
      } \
  } while (0)
1765
1766 /* Produce designation and invocation codes at a place pointed by DST
1767 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1768 Return new DST. */
1769
1770 unsigned char *
1771 encode_invocation_designation (charset, coding, dst)
1772 int charset;
1773 struct coding_system *coding;
1774 unsigned char *dst;
1775 {
1776 int reg; /* graphic register number */
1777
1778 /* At first, check designations. */
1779 for (reg = 0; reg < 4; reg++)
1780 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1781 break;
1782
1783 if (reg >= 4)
1784 {
1785 /* CHARSET is not yet designated to any graphic registers. */
1786 /* At first check the requested designation. */
1787 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1788 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1789 /* Since CHARSET requests no special designation, designate it
1790 to graphic register 0. */
1791 reg = 0;
1792
1793 ENCODE_DESIGNATION (charset, reg, coding);
1794 }
1795
1796 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1797 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1798 {
1799 /* Since the graphic register REG is not invoked to any graphic
1800 planes, invoke it to graphic plane 0. */
1801 switch (reg)
1802 {
1803 case 0: /* graphic register 0 */
1804 ENCODE_SHIFT_IN;
1805 break;
1806
1807 case 1: /* graphic register 1 */
1808 ENCODE_SHIFT_OUT;
1809 break;
1810
1811 case 2: /* graphic register 2 */
1812 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1813 ENCODE_SINGLE_SHIFT_2;
1814 else
1815 ENCODE_LOCKING_SHIFT_2;
1816 break;
1817
1818 case 3: /* graphic register 3 */
1819 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1820 ENCODE_SINGLE_SHIFT_3;
1821 else
1822 ENCODE_LOCKING_SHIFT_3;
1823 break;
1824 }
1825 }
1826
1827 return dst;
1828 }
1829
/* Write at DST the two bytes which encode composition rule RULE: the
   first is 32 + 81 plus the first reference point obtained from
   COMPOSITION_DECODE_RULE, the second is 32 plus the second one.  */

#define ENCODE_COMPOSITION_RULE(rule) \
  do { \
    int first_ref, second_ref; \
 \
    COMPOSITION_DECODE_RULE (rule, first_ref, second_ref); \
    *dst++ = (32 + 81) + first_ref; \
    *dst++ = 32 + second_ref; \
  } while (0)
1839
/* Write at DST the sequence which starts a composition (ESC 0,
   ESC 3, or ESC 4).  DATA points to an array of integers describing
   the composition; its element 3 is taken as the composition
   method.  See the comment in coding.h for the format of DATA.  For
   every method but COMPOSITION_RELATIVE, CODING->cmp_data_index is
   set to the fourth integer after CODING->cmp_data_start.  */

#define ENCODE_COMPOSITION_START(coding, data) \
  do { \
    (coding)->composing = (data)[3]; \
    *dst++ = ISO_CODE_ESC; \
    if ((coding)->composing != COMPOSITION_RELATIVE) \
      { \
        if ((coding)->composing == COMPOSITION_WITH_ALTCHARS) \
          *dst++ = '3'; \
        else \
          *dst++ = '4'; \
        (coding)->cmp_data_index = (coding)->cmp_data_start + 4; \
        (coding)->composition_rule_follows = 0; \
      } \
    else \
      *dst++ = '0'; \
  } while (0)
1859
/* Write at DST the sequence ESC 1 which ends the current
   composition, and step CODING->cmp_data_start over the record whose
   length is DATA[0].  When that exhausts the current composition
   data block and another block follows, move on to that block.  */

#define ENCODE_COMPOSITION_END(coding, data) \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = '1'; \
    (coding)->composing = COMPOSITION_NO; \
    (coding)->cmp_data_start += (data)[0]; \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used) \
      { \
        if ((coding)->cmp_data->next) \
          { \
            (coding)->cmp_data = (coding)->cmp_data->next; \
            (coding)->cmp_data_start = 0; \
          } \
      } \
  } while (0)
1875
/* Write at DST the composition start sequence ESC 0 and switch
   CODING to COMPOSITION_RELATIVE.  Here the sequence does not start
   a new composition; it tells that the components (alternate chars
   and composition rules) of the composition have just been produced
   and that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding) \
  do { \
    (coding)->composing = COMPOSITION_RELATIVE; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = '0'; \
  } while (0)
1887
/* The following three macros produce codes for indicating direction
   of text.  They are statement macros which write at DST (advancing
   it) and read the flags of the variable `coding' that is in scope
   where they are used.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the two bytes ESC [ when
   the coding system is restricted to seven bits, and the single byte
   CSI otherwise.  The flag is tested with `&' like every other
   CODING_FLAG_ISO_* test in this file; comparing the whole flag word
   with `==' would fail as soon as any other flag is also set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';                              \
    else                                                                \
      *dst++ = ISO_CODE_CSI;                                            \
  } while (0)

/* Emit the introducer followed by "2]" (right-to-left).  Written as
   a do-while statement because the introducer is itself a statement
   and cannot be an operand of the comma operator.  */
#define ENCODE_DIRECTION_R2L                                            \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    *dst++ = '2', *dst++ = ']';                                         \
  } while (0)

/* Emit the introducer followed by "0]" (left-to-right).  */
#define ENCODE_DIRECTION_L2R                                            \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    *dst++ = '0', *dst++ = ']';                                         \
  } while (0)
1903
/* Emit at DST whatever invocation and designation sequences are
   needed to bring the graphic planes and registers of `coding' back
   to their initial state: a shift-in if the invocation recorded for
   plane 0 is not 0, then, for each of the four graphic registers
   that has an initial designation (a non-negative value), a
   designation sequence if the current designation differs from the
   initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg;                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)      \
          continue;                                                     \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                   \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))       \
          continue;                                                     \
        ENCODE_DESIGNATION                                              \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                 \
  } while (0)
1918
1919 /* Produce designation sequences of charsets in the line started from
1920 SRC to a place pointed by DST, and return updated DST.
1921
1922 If the current block ends before any end-of-line, we may fail to
1923 find all the necessary designations. */
1924
1925 static unsigned char *
1926 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1927 struct coding_system *coding;
1928 Lisp_Object translation_table;
1929 unsigned char *src, *src_end, *dst;
1930 {
1931 int charset, c, found = 0, reg;
1932 /* Table of charsets to be designated to each graphic register. */
1933 int r[4];
1934
1935 for (reg = 0; reg < 4; reg++)
1936 r[reg] = -1;
1937
1938 while (found < 4)
1939 {
1940 ONE_MORE_CHAR (c);
1941 if (c == '\n')
1942 break;
1943
1944 charset = CHAR_CHARSET (c);
1945 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1946 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1947 {
1948 found++;
1949 r[reg] = charset;
1950 }
1951 }
1952
1953 label_end_of_loop:
1954 if (found)
1955 {
1956 for (reg = 0; reg < 4; reg++)
1957 if (r[reg] >= 0
1958 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1959 ENCODE_DESIGNATION (r[reg], reg, coding);
1960 }
1961
1962 return dst;
1963 }
1964
1965 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1966
/* Encode SRC_BYTES bytes of text at SOURCE into the ISO 2022 based
   coding system described by CODING, writing the result at
   DESTINATION, which has room for DST_BYTES bytes.

   On return CODING->consumed is the number of source bytes fully
   processed, CODING->consumed_char the number of characters encoded,
   and CODING->produced and CODING->produced_char are both the number
   of bytes written.  CODING->errors counts the non-ASCII single-byte
   characters that were copied through unencoded.  CODING->result is
   set to CODING_FINISH_INSUFFICIENT_DST when the loop stops for lack
   of output room.

   NOTE(review): when DST_BYTES is zero the output limit is taken
   from SRC instead of DST_END; presumably that is the in-place
   conversion case -- confirm against the callers.  */
1967 static void
1968 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1969 struct coding_system *coding;
1970 unsigned char *source, *destination;
1971 int src_bytes, dst_bytes;
1972 {
1973 unsigned char *src = source;
1974 unsigned char *src_end = source + src_bytes;
1975 unsigned char *dst = destination;
1976 unsigned char *dst_end = destination + dst_bytes;
1977 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1978 from DST_END to assure overflow checking is necessary only at the
1979 head of loop. */
1980 unsigned char *adjusted_dst_end = dst_end - 19;
1981 /* SRC_BASE remembers the start position in source in each loop.
1982 The loop will be exited when there's not enough source text to
1983 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1984 there's not enough destination area to produce encoded codes
1985 (within macro EMIT_BYTES). */
1986 unsigned char *src_base;
1987 int c;
1988 Lisp_Object translation_table;
1989
/* Use no translation table when character translation is disabled;
   otherwise use the coding system's own encode table, falling back
   to the standard one.  */
1990 if (NILP (Venable_character_translation))
1991 translation_table = Qnil;
1992 else
1993 {
1994 translation_table = coding->translation_table_for_encode;
1995 if (NILP (translation_table))
1996 translation_table = Vstandard_translation_table_for_encode;
1997 }
1998
1999 coding->consumed_char = 0;
2000 coding->errors = 0;
2001 while (1)
2002 {
2003 int charset, c1, c2;
2004
2005 src_base = src;
2006
/* The only output-space check of the iteration; see the comment on
   ADJUSTED_DST_END above.  */
2007 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2008 {
2009 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2010 break;
2011 }
2012
2013 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2014 && CODING_SPEC_ISO_BOL (coding))
2015 {
2016 /* We have to produce designation sequences if any now. */
2017 dst = encode_designation_at_bol (coding, translation_table,
2018 src, src_end, dst);
2019 CODING_SPEC_ISO_BOL (coding) = 0;
2020 }
2021
2022 /* Check composition start and end. */
2023 if (coding->composing != COMPOSITION_DISABLED
2024 && coding->cmp_data_start < coding->cmp_data->used)
2025 {
2026 struct composition_data *cmp_data = coding->cmp_data;
2027 int *data = cmp_data->data + coding->cmp_data_start;
2028 int this_pos = cmp_data->char_offset + coding->consumed_char;
2029
2030 if (coding->composing == COMPOSITION_RELATIVE)
2031 {
2032 if (this_pos == data[2])
2033 {
/* ENCODE_COMPOSITION_END may advance cmp_data_start or switch to
   the next data block, so CMP_DATA and DATA are reloaded.
   NOTE(review): if the finished record was the last one and no next
   block exists, DATA now points at cmp_data->used and DATA[1] is
   still read below -- confirm that this is harmless.  */
2034 ENCODE_COMPOSITION_END (coding, data);
2035 cmp_data = coding->cmp_data;
2036 data = cmp_data->data + coding->cmp_data_start;
2037 }
2038 }
2039 else if (COMPOSING_P (coding))
2040 {
2041 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS */
2042 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2043 /* We have consumed components of the composition.
2044 What follows in SRC is the composition's base
2045 text. */
2046 ENCODE_COMPOSITION_FAKE_START (coding);
2047 else
2048 {
/* This C shadows the function-level C; it holds the next component
   (a character or a rule) taken from the composition data, not from
   SRC.  */
2049 int c = cmp_data->data[coding->cmp_data_index++];
2050 if (coding->composition_rule_follows)
2051 {
2052 ENCODE_COMPOSITION_RULE (c);
2053 coding->composition_rule_follows = 0;
2054 }
2055 else
2056 {
2057 SPLIT_CHAR (c, charset, c1, c2);
2058 ENCODE_ISO_CHARACTER (charset, c1, c2);
2059 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2060 coding->composition_rule_follows = 1;
2061 }
/* A component was produced without reading SRC, so restart the loop
   without counting a consumed character.  */
2062 continue;
2063 }
2064 }
2065 if (!COMPOSING_P (coding))
2066 {
2067 if (this_pos == data[1])
2068 {
2069 ENCODE_COMPOSITION_START (coding, data);
2070 continue;
2071 }
2072 }
2073 }
2074
2075 ONE_MORE_CHAR (c);
2076
2077 /* Now encode the character C. */
2078 if (c < 0x20 || c == 0x7F)
2079 {
2080 if (c == '\r')
2081 {
2082 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2083 {
2084 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2085 ENCODE_RESET_PLANE_AND_REGISTER;
2086 *dst++ = c;
/* NOTE(review): this `continue' skips the consumed_char increment
   at the bottom of the loop, unlike the other control characters --
   confirm that this is intended.  */
2087 continue;
2088 }
2089 /* fall down to treat '\r' as '\n' ... */
2090 c = '\n';
2091 }
2092 if (c == '\n')
2093 {
2094 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2095 ENCODE_RESET_PLANE_AND_REGISTER;
2096 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2097 bcopy (coding->spec.iso2022.initial_designation,
2098 coding->spec.iso2022.current_designation,
2099 sizeof coding->spec.iso2022.initial_designation);
/* Produce the end-of-line in the format selected by
   coding->eol_type: LF (also when undecided), CR LF, or CR.  */
2100 if (coding->eol_type == CODING_EOL_LF
2101 || coding->eol_type == CODING_EOL_UNDECIDED)
2102 *dst++ = ISO_CODE_LF;
2103 else if (coding->eol_type == CODING_EOL_CRLF)
2104 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2105 else
2106 *dst++ = ISO_CODE_CR;
2107 CODING_SPEC_ISO_BOL (coding) = 1;
2108 }
2109 else
2110 {
2111 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2112 ENCODE_RESET_PLANE_AND_REGISTER;
2113 *dst++ = c;
2114 }
2115 }
2116 else if (ASCII_BYTE_P (c))
2117 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2118 else if (SINGLE_BYTE_CHAR_P (c))
2119 {
/* A non-ASCII single-byte character is copied as is and counted as
   an error.  */
2120 *dst++ = c;
2121 coding->errors++;
2122 }
2123 else
2124 {
2125 SPLIT_CHAR (c, charset, c1, c2);
2126 ENCODE_ISO_CHARACTER (charset, c1, c2);
2127 }
2128
2129 coding->consumed_char++;
2130 }
2131
2132 label_end_of_loop:
/* SRC_BASE, not SRC, is used so that a character which was only
   partially read is not reported as consumed.  */
2133 coding->consumed = src_base - source;
2134 coding->produced = coding->produced_char = dst - destination;
2135 }
2136
2137 \f
2138 /*** 4. SJIS and BIG5 handlers ***/
2139
2140 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2141 quite widely. So, for the moment, Emacs supports them in the bare
2142 C code. But, in the future, they may be supported only by CCL. */
2143
2144 /* SJIS is a coding system encoding three character sets: ASCII, right
2145 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2146 as is. A character of charset katakana-jisx0201 is encoded by
2147 "position-code + 0x80". A character of charset japanese-jisx0208
2148 is encoded in 2-byte but two position-codes are divided and shifted
2149 so that it fit in the range below.
2150
2151 --- CODE RANGE of SJIS ---
2152 (character set) (range)
2153 ASCII 0x00 .. 0x7F
2154 KATAKANA-JISX0201 0xA0 .. 0xDF
2155 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2156 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2157 -------------------------------
2158
2159 */
2160
2161 /* BIG5 is a coding system encoding two character sets: ASCII and
2162 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2163 character set and is encoded in two-byte.
2164
2165 --- CODE RANGE of BIG5 ---
2166 (character set) (range)
2167 ASCII 0x00 .. 0x7F
2168 Big5 (1st byte) 0xA1 .. 0xFE
2169 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2170 --------------------------
2171
2172 Since the number of characters in Big5 is larger than maximum
2173 characters in Emacs' charset (96x96), it can't be handled as one
2174 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2175 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2176 contains frequently used characters and the latter contains less
2177 frequently used characters. */
2178
2179 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2180 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2181 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2182 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2183
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte covers 0x40..0x7E (0x7F - 0x40 codes) and 0xA1..0xFE
   (0xFF - 0xA1 codes).  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the Big5 byte pair B1, B2 into an Emacs charset and the
   position codes C1, C2.  The pair is first turned into a linear
   index; lead bytes below 0xC9 belong to `charset_big5_1', the rest
   to `charset_big5_2' (whose index restarts from zero).  The index
   is then split into two position codes of a 94x94 charset.

   B1 and B2 are read before C1 and C2 are written, so the same
   variables may be passed for both.  Every argument is parenthesized
   so that expressions such as `x | 0x80' are evaluated as a unit.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET (`charset_big5_1' or
   `charset_big5_2') and the position codes C1, C2 into the Big5 byte
   pair B1, B2.  C1 and C2 are read before B1 and B2 are written, so
   the same variables may be passed for both.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Offsets 0..0x3E map to 0x40..0x7E, the rest to 0xA1..0xFE.  */   \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2211
2212 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2213 Check if a text is encoded in SJIS. If it is, return
2214 CODING_CATEGORY_MASK_SJIS, else return 0. */
2215
2216 int
2217 detect_coding_sjis (src, src_end)
2218 unsigned char *src, *src_end;
2219 {
2220 int c;
2221 /* Dummy for ONE_MORE_BYTE. */
2222 struct coding_system dummy_coding;
2223 struct coding_system *coding = &dummy_coding;
2224
2225 while (1)
2226 {
2227 ONE_MORE_BYTE (c);
2228 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2229 {
2230 ONE_MORE_BYTE (c);
2231 if (c < 0x40)
2232 return 0;
2233 }
2234 }
2235 label_end_of_loop:
2236 return CODING_CATEGORY_MASK_SJIS;
2237 }
2238
2239 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2240 Check if a text is encoded in BIG5. If it is, return
2241 CODING_CATEGORY_MASK_BIG5, else return 0. */
2242
2243 int
2244 detect_coding_big5 (src, src_end)
2245 unsigned char *src, *src_end;
2246 {
2247 int c;
2248 /* Dummy for ONE_MORE_BYTE. */
2249 struct coding_system dummy_coding;
2250 struct coding_system *coding = &dummy_coding;
2251
2252 while (1)
2253 {
2254 ONE_MORE_BYTE (c);
2255 if (c >= 0xA1)
2256 {
2257 ONE_MORE_BYTE (c);
2258 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2259 return 0;
2260 }
2261 }
2262 label_end_of_loop:
2263 return CODING_CATEGORY_MASK_BIG5;
2264 }
2265
2266 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2267 Check if a text is encoded in UTF-8. If it is, return
2268 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2269
/* Predicates classifying one octet C of a UTF-8 sequence.  A leading
   octet is recognized by its high bits: MASK selects the bits to
   look at and BITS is the value they must have.  */
#define UTF_8_HIGH_BITS_P(c, mask, bits) (((c) & (mask)) == (bits))

/* C is a complete one-octet character.  */
#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
/* C is a continuation octet (10xxxxxx).  */
#define UTF_8_EXTRA_OCTET_P(c) UTF_8_HIGH_BITS_P (c, 0xC0, 0x80)
/* C starts a sequence of 2, 3, 4, 5, or 6 octets respectively.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_HIGH_BITS_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_HIGH_BITS_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_HIGH_BITS_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_HIGH_BITS_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_HIGH_BITS_P (c, 0xFE, 0xFC)
2277
2278 int
2279 detect_coding_utf_8 (src, src_end)
2280 unsigned char *src, *src_end;
2281 {
2282 unsigned char c;
2283 int seq_maybe_bytes;
2284 /* Dummy for ONE_MORE_BYTE. */
2285 struct coding_system dummy_coding;
2286 struct coding_system *coding = &dummy_coding;
2287
2288 while (1)
2289 {
2290 ONE_MORE_BYTE (c);
2291 if (UTF_8_1_OCTET_P (c))
2292 continue;
2293 else if (UTF_8_2_OCTET_LEADING_P (c))
2294 seq_maybe_bytes = 1;
2295 else if (UTF_8_3_OCTET_LEADING_P (c))
2296 seq_maybe_bytes = 2;
2297 else if (UTF_8_4_OCTET_LEADING_P (c))
2298 seq_maybe_bytes = 3;
2299 else if (UTF_8_5_OCTET_LEADING_P (c))
2300 seq_maybe_bytes = 4;
2301 else if (UTF_8_6_OCTET_LEADING_P (c))
2302 seq_maybe_bytes = 5;
2303 else
2304 return 0;
2305
2306 do
2307 {
2308 ONE_MORE_BYTE (c);
2309 if (!UTF_8_EXTRA_OCTET_P (c))
2310 return 0;
2311 seq_maybe_bytes--;
2312 }
2313 while (seq_maybe_bytes > 0);
2314 }
2315
2316 label_end_of_loop:
2317 return CODING_CATEGORY_MASK_UTF_8;
2318 }
2319
2320 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2321 Check if a text is encoded in UTF-16 by looking at its first two
2322 bytes. If they are 0xFE 0xFF, return CODING_CATEGORY_MASK_UTF_16_BE;
2323 if they are 0xFF 0xFE, return CODING_CATEGORY_MASK_UTF_16_LE;
2324 else return 0. */
2325
/* VAL is one of the two 16-bit values that are never valid
   characters.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* VAL is a high (leading) surrogate, i.e. in 0xD800..0xDBFF.  The
   top six bits identify the surrogate block, hence the mask 0xFC00;
   masking with 0xD800 would also accept values such as 0xF900 and
   0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* VAL is a low (trailing) surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2335
2336 int
2337 detect_coding_utf_16 (src, src_end)
2338 unsigned char *src, *src_end;
2339 {
2340 unsigned char c1, c2;
2341 /* Dummy for TWO_MORE_BYTES. */
2342 struct coding_system dummy_coding;
2343 struct coding_system *coding = &dummy_coding;
2344
2345 TWO_MORE_BYTES (c1, c2);
2346
2347 if ((c1 == 0xFF) && (c2 == 0xFE))
2348 return CODING_CATEGORY_MASK_UTF_16_LE;
2349 else if ((c1 == 0xFE) && (c2 == 0xFF))
2350 return CODING_CATEGORY_MASK_UTF_16_BE;
2351
2352 label_end_of_loop:
2353 return 0;
2354 }
2355
2356 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2357 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
2358
2359 static void
2360 decode_coding_sjis_big5 (coding, source, destination,
2361 src_bytes, dst_bytes, sjis_p)
2362 struct coding_system *coding;
2363 unsigned char *source, *destination;
2364 int src_bytes, dst_bytes;
2365 int sjis_p;
2366 {
2367 unsigned char *src = source;
2368 unsigned char *src_end = source + src_bytes;
2369 unsigned char *dst = destination;
2370 unsigned char *dst_end = destination + dst_bytes;
2371 /* SRC_BASE remembers the start position in source in each loop.
2372 The loop will be exited when there's not enough source code
2373 (within macro ONE_MORE_BYTE), or when there's not enough
2374 destination area to produce a character (within macro
2375 EMIT_CHAR). */
2376 unsigned char *src_base;
2377 Lisp_Object translation_table;
2378
2379 if (NILP (Venable_character_translation))
2380 translation_table = Qnil;
2381 else
2382 {
2383 translation_table = coding->translation_table_for_decode;
2384 if (NILP (translation_table))
2385 translation_table = Vstandard_translation_table_for_decode;
2386 }
2387
2388 coding->produced_char = 0;
2389 while (1)
2390 {
2391 int c, charset, c1, c2;
2392
2393 src_base = src;
2394 ONE_MORE_BYTE (c1);
2395
2396 if (c1 < 0x80)
2397 {
2398 charset = CHARSET_ASCII;
2399 if (c1 < 0x20)
2400 {
2401 if (c1 == '\r')
2402 {
2403 if (coding->eol_type == CODING_EOL_CRLF)
2404 {
2405 ONE_MORE_BYTE (c2);
2406 if (c2 == '\n')
2407 c1 = c2;
2408 else if (coding->mode
2409 & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2410 {
2411 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2412 goto label_end_of_loop;
2413 }
2414 else
2415 /* To process C2 again, SRC is subtracted by 1. */
2416 src--;
2417 }
2418 else if (coding->eol_type == CODING_EOL_CR)
2419 c1 = '\n';
2420 }
2421 else if (c1 == '\n'
2422 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2423 && (coding->eol_type == CODING_EOL_CR
2424 || coding->eol_type == CODING_EOL_CRLF))
2425 {
2426 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2427 goto label_end_of_loop;
2428 }
2429 }
2430 }
2431 else
2432 {
2433 if (sjis_p)
2434 {
2435 if (c1 >= 0xF0)
2436 goto label_invalid_code;
2437 if (c1 < 0xA0 || c1 >= 0xE0)
2438 {
2439 /* SJIS -> JISX0208 */
2440 ONE_MORE_BYTE (c2);
2441 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2442 goto label_invalid_code;
2443 DECODE_SJIS (c1, c2, c1, c2);
2444 charset = charset_jisx0208;
2445 }
2446 else
2447 /* SJIS -> JISX0201-Kana */
2448 charset = charset_katakana_jisx0201;
2449 }
2450 else
2451 {
2452 /* BIG5 -> Big5 */
2453 if (c1 < 0xA1 || c1 > 0xFE)
2454 goto label_invalid_code;
2455 ONE_MORE_BYTE (c2);
2456 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2457 goto label_invalid_code;
2458 DECODE_BIG5 (c1, c2, charset, c1, c2);
2459 }
2460 }
2461
2462 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2463 EMIT_CHAR (c);
2464 continue;
2465
2466 label_invalid_code:
2467 coding->errors++;
2468 src = src_base;
2469 c = *src++;
2470 EMIT_CHAR (c);
2471 }
2472
2473 label_end_of_loop:
2474 coding->consumed = coding->consumed_char = src_base - source;
2475 coding->produced = dst - destination;
2476 return;
2477 }
2478
2479 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2480 This function can encode charsets `ascii', `katakana-jisx0201',
2481 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2482 are sure that all these charsets are registered as official charset
2483 (i.e. do not have extended leading-codes). Characters of other
2484 charsets are produced without any encoding. If SJIS_P is 1, encode
2485 SJIS text, else encode BIG5 text. */
2486
2487 static void
2488 encode_coding_sjis_big5 (coding, source, destination,
2489 src_bytes, dst_bytes, sjis_p)
2490 struct coding_system *coding;
2491 unsigned char *source, *destination;
2492 int src_bytes, dst_bytes;
2493 int sjis_p;
2494 {
2495 unsigned char *src = source;
2496 unsigned char *src_end = source + src_bytes;
2497 unsigned char *dst = destination;
2498 unsigned char *dst_end = destination + dst_bytes;
2499 /* SRC_BASE remembers the start position in source in each loop.
2500 The loop will be exited when there's not enough source text to
2501 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2502 there's not enough destination area to produce encoded codes
2503 (within macro EMIT_BYTES). */
2504 unsigned char *src_base;
2505 Lisp_Object translation_table;
2506
2507 if (NILP (Venable_character_translation))
2508 translation_table = Qnil;
2509 else
2510 {
2511 translation_table = coding->translation_table_for_decode;
2512 if (NILP (translation_table))
2513 translation_table = Vstandard_translation_table_for_decode;
2514 }
2515
2516 while (1)
2517 {
2518 int c, charset, c1, c2;
2519
2520 src_base = src;
2521 ONE_MORE_CHAR (c);
2522
2523 /* Now encode the character C. */
2524 if (SINGLE_BYTE_CHAR_P (c))
2525 {
2526 switch (c)
2527 {
2528 case '\r':
2529 if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2530 {
2531 EMIT_ONE_BYTE (c);
2532 break;
2533 }
2534 c = '\n';
2535 case '\n':
2536 if (coding->eol_type == CODING_EOL_CRLF)
2537 {
2538 EMIT_TWO_BYTES ('\r', c);
2539 break;
2540 }
2541 else if (coding->eol_type == CODING_EOL_CR)
2542 c = '\r';
2543 default:
2544 EMIT_ONE_BYTE (c);
2545 }
2546 }
2547 else
2548 {
2549 SPLIT_CHAR (c, charset, c1, c2);
2550 if (sjis_p)
2551 {
2552 if (charset == charset_jisx0208
2553 || charset == charset_jisx0208_1978)
2554 {
2555 ENCODE_SJIS (c1, c2, c1, c2);
2556 EMIT_TWO_BYTES (c1, c2);
2557 }
2558 else if (charset == charset_latin_jisx0201)
2559 EMIT_ONE_BYTE (c1);
2560 else
2561 /* There's no way other than producing the internal
2562 codes as is. */
2563 EMIT_BYTES (src_base, src);
2564 }
2565 else
2566 {
2567 if (charset == charset_big5_1 || charset == charset_big5_2)
2568 {
2569 ENCODE_BIG5 (charset, c1, c2, c1, c2);
2570 EMIT_TWO_BYTES (c1, c2);
2571 }
2572 else
2573 /* There's no way other than producing the internal
2574 codes as is. */
2575 EMIT_BYTES (src_base, src);
2576 }
2577 }
2578 coding->consumed_char++;
2579 }
2580
2581 label_end_of_loop:
2582 coding->consumed = src_base - source;
2583 coding->produced = coding->produced_char = dst - destination;
2584 }
2585
2586 \f
2587 /*** 5. CCL handlers ***/
2588
2589 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2590 Check if a text is encoded in a coding system of which
2591 encoder/decoder are written in CCL program. If it is, return
2592 CODING_CATEGORY_MASK_CCL, else return 0. */
2593
2594 int
2595 detect_coding_ccl (src, src_end)
2596 unsigned char *src, *src_end;
2597 {
2598 unsigned char *valid;
2599 int c;
2600 /* Dummy for ONE_MORE_BYTE. */
2601 struct coding_system dummy_coding;
2602 struct coding_system *coding = &dummy_coding;
2603
2604 /* No coding system is assigned to coding-category-ccl. */
2605 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2606 return 0;
2607
2608 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2609 while (1)
2610 {
2611 ONE_MORE_BYTE (c);
2612 if (! valid[c])
2613 return 0;
2614 }
2615 label_end_of_loop:
2616 return CODING_CATEGORY_MASK_CCL;
2617 }
2618
2619 \f
2620 /*** 6. End-of-line handlers ***/
2621
2622 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2623
2624 static void
2625 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2626 struct coding_system *coding;
2627 unsigned char *source, *destination;
2628 int src_bytes, dst_bytes;
2629 {
2630 unsigned char *src = source;
2631 unsigned char *dst = destination;
2632 unsigned char *src_end = src + src_bytes;
2633 unsigned char *dst_end = dst + dst_bytes;
2634 Lisp_Object translation_table;
2635 /* SRC_BASE remembers the start position in source in each loop.
2636 The loop will be exited when there's not enough source code
2637 (within macro ONE_MORE_BYTE), or when there's not enough
2638 destination area to produce a character (within macro
2639 EMIT_CHAR). */
2640 unsigned char *src_base;
2641 int c;
2642
2643 translation_table = Qnil;
2644 switch (coding->eol_type)
2645 {
2646 case CODING_EOL_CRLF:
2647 while (1)
2648 {
2649 src_base = src;
2650 ONE_MORE_BYTE (c);
2651 if (c == '\r')
2652 {
2653 ONE_MORE_BYTE (c);
2654 if (c != '\n')
2655 {
2656 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2657 {
2658 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2659 goto label_end_of_loop;
2660 }
2661 src--;
2662 c = '\r';
2663 }
2664 }
2665 else if (c == '\n'
2666 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2667 {
2668 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2669 goto label_end_of_loop;
2670 }
2671 EMIT_CHAR (c);
2672 }
2673 break;
2674
2675 case CODING_EOL_CR:
2676 while (1)
2677 {
2678 src_base = src;
2679 ONE_MORE_BYTE (c);
2680 if (c == '\n')
2681 {
2682 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2683 {
2684 coding->result = CODING_FINISH_INCONSISTENT_EOL;
2685 goto label_end_of_loop;
2686 }
2687 }
2688 else if (c == '\r')
2689 c = '\n';
2690 EMIT_CHAR (c);
2691 }
2692 break;
2693
2694 default: /* no need for EOL handling */
2695 while (1)
2696 {
2697 src_base = src;
2698 ONE_MORE_BYTE (c);
2699 EMIT_CHAR (c);
2700 }
2701 }
2702
2703 label_end_of_loop:
2704 coding->consumed = coding->consumed_char = src_base - source;
2705 coding->produced = dst - destination;
2706 return;
2707 }
2708
2709 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2710 format of end-of-line according to `coding->eol_type'. It also
2711 converts multibyte form 8-bit characters to unibyte if
2712 CODING->src_multibyte is nonzero. If `coding->mode &
2713 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2714 also means end-of-line. */
2715
2716 static void
2717 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2718 struct coding_system *coding;
2719 unsigned char *source, *destination;
2720 int src_bytes, dst_bytes;
2721 {
2722 unsigned char *src = source;
2723 unsigned char *dst = destination;
2724 unsigned char *src_end = src + src_bytes;
2725 unsigned char *dst_end = dst + dst_bytes;
2726 Lisp_Object translation_table;
2727 /* SRC_BASE remembers the start position in source in each loop.
2728 The loop will be exited when there's not enough source text to
2729 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2730 there's not enough destination area to produce encoded codes
2731 (within macro EMIT_BYTES). */
2732 unsigned char *src_base;
2733 int c;
2734 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2735
2736 translation_table = Qnil;
2737 if (coding->src_multibyte
2738 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2739 {
2740 src_end--;
2741 src_bytes--;
2742 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2743 }
2744
2745 if (coding->eol_type == CODING_EOL_CRLF)
2746 {
2747 while (src < src_end)
2748 {
2749 src_base = src;
2750 c = *src++;
2751 if (c >= 0x20)
2752 EMIT_ONE_BYTE (c);
2753 else if (c == '\n' || (c == '\r' && selective_display))
2754 EMIT_TWO_BYTES ('\r', '\n');
2755 else
2756 EMIT_ONE_BYTE (c);
2757 }
2758 src_base = src;
2759 label_end_of_loop:
2760 ;
2761 }
2762 else
2763 {
2764 if (src_bytes <= dst_bytes)
2765 {
2766 safe_bcopy (src, dst, src_bytes);
2767 src_base = src_end;
2768 dst += src_bytes;
2769 }
2770 else
2771 {
2772 if (coding->src_multibyte
2773 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2774 dst_bytes--;
2775 safe_bcopy (src, dst, dst_bytes);
2776 src_base = src + dst_bytes;
2777 dst = destination + dst_bytes;
2778 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2779 }
2780 if (coding->eol_type == CODING_EOL_CR)
2781 {
2782 for (src = destination; src < dst; src++)
2783 if (*src == '\n') *src = '\r';
2784 }
2785 else if (selective_display)
2786 {
2787 for (src = destination; src < dst; src++)
2788 if (*src == '\r') *src = '\n';
2789 }
2790 }
2791 if (coding->src_multibyte)
2792 dst = destination + str_as_unibyte (destination, dst - destination);
2793
2794 coding->consumed = src_base - source;
2795 coding->produced = dst - destination;
2796 }
2797
2798 \f
2799 /*** 7. C library functions ***/
2800
2801 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2802 has a property `coding-system'. The value of this property is a
2803 vector of length 5 (called as coding-vector). Among elements of
2804 this vector, the first (element[0]) and the fifth (element[4])
2805 carry important information for decoding/encoding. Before
2806 decoding/encoding, this information should be set in fields of a
2807 structure of type `coding_system'.
2808
2809 A value of property `coding-system' can be a symbol of another
2810 subsidiary coding-system. In that case, Emacs gets coding-vector
2811 from that symbol.
2812
2813 `element[0]' contains information to be set in `coding->type'. The
2814 value and its meaning is as follows:
2815
2816 0 -- coding_type_emacs_mule
2817 1 -- coding_type_sjis
2818 2 -- coding_type_iso2022
2819 3 -- coding_type_big5
2820 4 -- coding_type_ccl encoder/decoder written in CCL
2821 nil -- coding_type_no_conversion
2822 t -- coding_type_undecided (automatic conversion on decoding,
2823 no-conversion on encoding)
2824
2825 `element[4]' contains information to be set in `coding->flags' and
2826 `coding->spec'. The meaning varies by `coding->type'.
2827
2828 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2829 of length 32 (of which the first 13 sub-elements are used now).
2830 Meanings of these sub-elements are:
2831
2832 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2833 If the value is an integer of valid charset, the charset is
2834 assumed to be designated to graphic register N initially.
2835
2836 If the value is minus, it is a minus value of charset which
2837 reserves graphic register N, which means that the charset is
2838 not designated initially but should be designated to graphic
2839 register N just before encoding a character in that charset.
2840
2841 If the value is nil, graphic register N is never used on
2842 encoding.
2843
2844 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2845 Each value takes t or nil. See the section ISO2022 of
2846 `coding.h' for more information.
2847
2848 If `coding->type' is `coding_type_big5', element[4] is t to denote
2849 BIG5-ETen or nil to denote BIG5-HKU.
2850
2851 If `coding->type' takes the other value, element[4] is ignored.
2852
2853 Emacs Lisp's coding system also carries information about format of
2854 end-of-line in a value of property `eol-type'. If the value is
2855 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2856 means CODING_EOL_CR. If it is not integer, it should be a vector
2857 of subsidiary coding systems of which property `eol-type' has one
2858 of above values.
2859
2860 */
2861
2862 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2863 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2864 is setup so that no conversion is necessary and return -1, else
2865 return 0. */
2866
2867 int
2868 setup_coding_system (coding_system, coding)
2869 Lisp_Object coding_system;
2870 struct coding_system *coding;
2871 {
2872 Lisp_Object coding_spec, coding_type, eol_type, plist;
2873 Lisp_Object val;
2874 int i;
2875
2876 /* Initialize some fields required for all kinds of coding systems. */
2877 coding->symbol = coding_system;
2878 coding->common_flags = 0;
2879 coding->mode = 0;
2880 coding->heading_ascii = -1;
2881 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2882 coding->composing = COMPOSITION_DISABLED;
2883 coding->cmp_data = NULL;
2884
2885 if (NILP (coding_system))
2886 goto label_invalid_coding_system;
2887
2888 coding_spec = Fget (coding_system, Qcoding_system);
2889
2890 if (!VECTORP (coding_spec)
2891 || XVECTOR (coding_spec)->size != 5
2892 || !CONSP (XVECTOR (coding_spec)->contents[3]))
2893 goto label_invalid_coding_system;
2894
2895 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2896 if (VECTORP (eol_type))
2897 {
2898 coding->eol_type = CODING_EOL_UNDECIDED;
2899 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2900 }
2901 else if (XFASTINT (eol_type) == 1)
2902 {
2903 coding->eol_type = CODING_EOL_CRLF;
2904 coding->common_flags
2905 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2906 }
2907 else if (XFASTINT (eol_type) == 2)
2908 {
2909 coding->eol_type = CODING_EOL_CR;
2910 coding->common_flags
2911 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2912 }
2913 else
2914 coding->eol_type = CODING_EOL_LF;
2915
2916 coding_type = XVECTOR (coding_spec)->contents[0];
2917 /* Try short cut. */
2918 if (SYMBOLP (coding_type))
2919 {
2920 if (EQ (coding_type, Qt))
2921 {
2922 coding->type = coding_type_undecided;
2923 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2924 }
2925 else
2926 coding->type = coding_type_no_conversion;
2927 return 0;
2928 }
2929
2930 /* Get values of coding system properties:
2931 `post-read-conversion', `pre-write-conversion',
2932 `translation-table-for-decode', `translation-table-for-encode'. */
2933 plist = XVECTOR (coding_spec)->contents[3];
2934 /* Pre & post conversion functions should be disabled if
2935 inhibit_pre_post_conversion is nonzero. This is the case that a code
2936 conversion function is called while those functions are running. */
2937 if (! inhibit_pre_post_conversion)
2938 {
2939 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2940 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2941 }
2942 val = Fplist_get (plist, Qtranslation_table_for_decode);
2943 if (SYMBOLP (val))
2944 val = Fget (val, Qtranslation_table_for_decode);
2945 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2946 val = Fplist_get (plist, Qtranslation_table_for_encode);
2947 if (SYMBOLP (val))
2948 val = Fget (val, Qtranslation_table_for_encode);
2949 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2950 val = Fplist_get (plist, Qcoding_category);
2951 if (!NILP (val))
2952 {
2953 val = Fget (val, Qcoding_category_index);
2954 if (INTEGERP (val))
2955 coding->category_idx = XINT (val);
2956 else
2957 goto label_invalid_coding_system;
2958 }
2959 else
2960 goto label_invalid_coding_system;
2961
2962 val = Fplist_get (plist, Qsafe_charsets);
2963 if (EQ (val, Qt))
2964 {
2965 for (i = 0; i <= MAX_CHARSET; i++)
2966 coding->safe_charsets[i] = 1;
2967 }
2968 else
2969 {
2970 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2971 while (CONSP (val))
2972 {
2973 if ((i = get_charset_id (XCAR (val))) >= 0)
2974 coding->safe_charsets[i] = 1;
2975 val = XCDR (val);
2976 }
2977 }
2978
2979 /* If the coding system has non-nil `composition' property, enable
2980 composition handling. */
2981 val = Fplist_get (plist, Qcomposition);
2982 if (!NILP (val))
2983 coding->composing = COMPOSITION_NO;
2984
2985 switch (XFASTINT (coding_type))
2986 {
2987 case 0:
2988 coding->type = coding_type_emacs_mule;
2989 if (!NILP (coding->post_read_conversion))
2990 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2991 if (!NILP (coding->pre_write_conversion))
2992 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2993 break;
2994
2995 case 1:
2996 coding->type = coding_type_sjis;
2997 coding->common_flags
2998 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2999 break;
3000
3001 case 2:
3002 coding->type = coding_type_iso2022;
3003 coding->common_flags
3004 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3005 {
3006 Lisp_Object val, temp;
3007 Lisp_Object *flags;
3008 int i, charset, reg_bits = 0;
3009
3010 val = XVECTOR (coding_spec)->contents[4];
3011
3012 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3013 goto label_invalid_coding_system;
3014
3015 flags = XVECTOR (val)->contents;
3016 coding->flags
3017 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3018 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3019 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3020 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3021 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3022 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3023 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3024 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3025 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3026 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3027 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3028 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3029 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3030 );
3031
3032 /* Invoke graphic register 0 to plane 0. */
3033 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3034 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3035 CODING_SPEC_ISO_INVOCATION (coding, 1)
3036 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3037 /* Not single shifting at first. */
3038 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3039 /* Beginning of buffer should also be regarded as bol. */
3040 CODING_SPEC_ISO_BOL (coding) = 1;
3041
3042 for (charset = 0; charset <= MAX_CHARSET; charset++)
3043 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3044 val = Vcharset_revision_alist;
3045 while (CONSP (val))
3046 {
3047 charset = get_charset_id (Fcar_safe (XCAR (val)));
3048 if (charset >= 0
3049 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3050 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3051 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3052 val = XCDR (val);
3053 }
3054
3055 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3056 FLAGS[REG] can be one of below:
3057 integer CHARSET: CHARSET occupies register I,
3058 t: designate nothing to REG initially, but can be used
3059 by any charsets,
3060 list of integer, nil, or t: designate the first
3061 element (if integer) to REG initially, the remaining
3062 elements (if integer) is designated to REG on request,
3063 if an element is t, REG can be used by any charsets,
3064 nil: REG is never used. */
3065 for (charset = 0; charset <= MAX_CHARSET; charset++)
3066 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3067 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3068 for (i = 0; i < 4; i++)
3069 {
3070 if (INTEGERP (flags[i])
3071 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3072 || (charset = get_charset_id (flags[i])) >= 0)
3073 {
3074 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3075 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3076 }
3077 else if (EQ (flags[i], Qt))
3078 {
3079 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3080 reg_bits |= 1 << i;
3081 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3082 }
3083 else if (CONSP (flags[i]))
3084 {
3085 Lisp_Object tail;
3086 tail = flags[i];
3087
3088 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3089 if (INTEGERP (XCAR (tail))
3090 && (charset = XINT (XCAR (tail)),
3091 CHARSET_VALID_P (charset))
3092 || (charset = get_charset_id (XCAR (tail))) >= 0)
3093 {
3094 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3095 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3096 }
3097 else
3098 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3099 tail = XCDR (tail);
3100 while (CONSP (tail))
3101 {
3102 if (INTEGERP (XCAR (tail))
3103 && (charset = XINT (XCAR (tail)),
3104 CHARSET_VALID_P (charset))
3105 || (charset = get_charset_id (XCAR (tail))) >= 0)
3106 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3107 = i;
3108 else if (EQ (XCAR (tail), Qt))
3109 reg_bits |= 1 << i;
3110 tail = XCDR (tail);
3111 }
3112 }
3113 else
3114 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3115
3116 CODING_SPEC_ISO_DESIGNATION (coding, i)
3117 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3118 }
3119
3120 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3121 {
3122 /* REG 1 can be used only by locking shift in 7-bit env. */
3123 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3124 reg_bits &= ~2;
3125 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3126 /* Without any shifting, only REG 0 and 1 can be used. */
3127 reg_bits &= 3;
3128 }
3129
3130 if (reg_bits)
3131 for (charset = 0; charset <= MAX_CHARSET; charset++)
3132 {
3133 if (CHARSET_VALID_P (charset))
3134 {
3135 /* There exist some default graphic registers to be
3136 used CHARSET. */
3137
3138 /* We had better avoid designating a charset of
3139 CHARS96 to REG 0 as far as possible. */
3140 if (CHARSET_CHARS (charset) == 96)
3141 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3142 = (reg_bits & 2
3143 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3144 else
3145 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3146 = (reg_bits & 1
3147 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3148 }
3149 }
3150 }
3151 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3152 coding->spec.iso2022.last_invalid_designation_register = -1;
3153 break;
3154
3155 case 3:
3156 coding->type = coding_type_big5;
3157 coding->common_flags
3158 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3159 coding->flags
3160 = (NILP (XVECTOR (coding_spec)->contents[4])
3161 ? CODING_FLAG_BIG5_HKU
3162 : CODING_FLAG_BIG5_ETEN);
3163 break;
3164
3165 case 4:
3166 coding->type = coding_type_ccl;
3167 coding->common_flags
3168 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3169 {
3170 val = XVECTOR (coding_spec)->contents[4];
3171 if (! CONSP (val)
3172 || setup_ccl_program (&(coding->spec.ccl.decoder),
3173 XCAR (val)) < 0
3174 || setup_ccl_program (&(coding->spec.ccl.encoder),
3175 XCDR (val)) < 0)
3176 goto label_invalid_coding_system;
3177
3178 bzero (coding->spec.ccl.valid_codes, 256);
3179 val = Fplist_get (plist, Qvalid_codes);
3180 if (CONSP (val))
3181 {
3182 Lisp_Object this;
3183
3184 for (; CONSP (val); val = XCDR (val))
3185 {
3186 this = XCAR (val);
3187 if (INTEGERP (this)
3188 && XINT (this) >= 0 && XINT (this) < 256)
3189 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3190 else if (CONSP (this)
3191 && INTEGERP (XCAR (this))
3192 && INTEGERP (XCDR (this)))
3193 {
3194 int start = XINT (XCAR (this));
3195 int end = XINT (XCDR (this));
3196
3197 if (start >= 0 && start <= end && end < 256)
3198 while (start <= end)
3199 coding->spec.ccl.valid_codes[start++] = 1;
3200 }
3201 }
3202 }
3203 }
3204 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3205 coding->spec.ccl.cr_carryover = 0;
3206 break;
3207
3208 case 5:
3209 coding->type = coding_type_raw_text;
3210 break;
3211
3212 default:
3213 goto label_invalid_coding_system;
3214 }
3215 return 0;
3216
3217 label_invalid_coding_system:
3218 coding->type = coding_type_no_conversion;
3219 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3220 coding->common_flags = 0;
3221 coding->eol_type = CODING_EOL_LF;
3222 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3223 return -1;
3224 }
3225
3226 /* Free memory blocks allocated for storing composition information. */
3227
3228 void
3229 coding_free_composition_data (coding)
3230 struct coding_system *coding;
3231 {
3232 struct composition_data *cmp_data = coding->cmp_data, *next;
3233
3234 if (!cmp_data)
3235 return;
3236 /* Memory blocks are chained. At first, rewind to the first, then,
3237 free blocks one by one. */
3238 while (cmp_data->prev)
3239 cmp_data = cmp_data->prev;
3240 while (cmp_data)
3241 {
3242 next = cmp_data->next;
3243 xfree (cmp_data);
3244 cmp_data = next;
3245 }
3246 coding->cmp_data = NULL;
3247 }
3248
3249 /* Set `char_offset' member of all memory blocks pointed by
3250 coding->cmp_data to POS. */
3251
3252 void
3253 coding_adjust_composition_offset (coding, pos)
3254 struct coding_system *coding;
3255 int pos;
3256 {
3257 struct composition_data *cmp_data;
3258
3259 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3260 cmp_data->char_offset = pos;
3261 }
3262
3263 /* Setup raw-text or one of its subsidiaries in the structure
3264 coding_system CODING according to the already setup value eol_type
3265 in CODING. CODING should be setup for some coding system in
3266 advance. */
3267
3268 void
3269 setup_raw_text_coding_system (coding)
3270 struct coding_system *coding;
3271 {
3272 if (coding->type != coding_type_raw_text)
3273 {
3274 coding->symbol = Qraw_text;
3275 coding->type = coding_type_raw_text;
3276 if (coding->eol_type != CODING_EOL_UNDECIDED)
3277 {
3278 Lisp_Object subsidiaries;
3279 subsidiaries = Fget (Qraw_text, Qeol_type);
3280
3281 if (VECTORP (subsidiaries)
3282 && XVECTOR (subsidiaries)->size == 3)
3283 coding->symbol
3284 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3285 }
3286 setup_coding_system (coding->symbol, coding);
3287 }
3288 return;
3289 }
3290
3291 /* Emacs has a mechanism to automatically detect a coding system if it
3292 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3293 it's impossible to distinguish some coding systems accurately
3294 because they use the same range of codes. So, at first, coding
3295 systems are categorized into the following categories:
3296
3297 o coding-category-emacs-mule
3298
3299 The category for a coding system which has the same code range
3300 as Emacs' internal format. Assigned the coding-system (Lisp
3301 symbol) `emacs-mule' by default.
3302
3303 o coding-category-sjis
3304
3305 The category for a coding system which has the same code range
3306 as SJIS. Assigned the coding-system (Lisp
3307 symbol) `japanese-shift-jis' by default.
3308
3309 o coding-category-iso-7
3310
3311 The category for a coding system which has the same code range
3312 as ISO2022 of 7-bit environment. This doesn't use any locking
3313 shift and single shift functions. This can encode/decode all
3314 charsets. Assigned the coding-system (Lisp symbol)
3315 `iso-2022-7bit' by default.
3316
3317 o coding-category-iso-7-tight
3318
3319 Same as coding-category-iso-7 except that this can
3320 encode/decode only the specified charsets.
3321
3322 o coding-category-iso-8-1
3323
3324 The category for a coding system which has the same code range
3325 as ISO2022 of 8-bit environment and graphic plane 1 used only
3326 for DIMENSION1 charset. This doesn't use any locking shift
3327 and single shift functions. Assigned the coding-system (Lisp
3328 symbol) `iso-latin-1' by default.
3329
3330 o coding-category-iso-8-2
3331
3332 The category for a coding system which has the same code range
3333 as ISO2022 of 8-bit environment and graphic plane 1 used only
3334 for DIMENSION2 charset. This doesn't use any locking shift
3335 and single shift functions. Assigned the coding-system (Lisp
3336 symbol) `japanese-iso-8bit' by default.
3337
3338 o coding-category-iso-7-else
3339
3340 The category for a coding system which has the same code range
3341 as ISO2022 of 7-bit environment but uses locking shift or
3342 single shift functions. Assigned the coding-system (Lisp
3343 symbol) `iso-2022-7bit-lock' by default.
3344
3345 o coding-category-iso-8-else
3346
3347 The category for a coding system which has the same code range
3348 as ISO2022 of 8-bit environment but uses locking shift or
3349 single shift functions. Assigned the coding-system (Lisp
3350 symbol) `iso-2022-8bit-ss2' by default.
3351
3352 o coding-category-big5
3353
3354 The category for a coding system which has the same code range
3355 as BIG5. Assigned the coding-system (Lisp symbol)
3356 `cn-big5' by default.
3357
3358 o coding-category-utf-8
3359
3360 The category for a coding system which has the same code range
3361 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3362 symbol) `utf-8' by default.
3363
3364 o coding-category-utf-16-be
3365
3366 The category for a coding system in which a text has an
3367 Unicode signature (cf. Unicode Standard) in the order of BIG
3368 endian at the head. Assigned the coding-system (Lisp symbol)
3369 `utf-16-be' by default.
3370
3371 o coding-category-utf-16-le
3372
3373 The category for a coding system in which a text has an
3374 Unicode signature (cf. Unicode Standard) in the order of
3375 LITTLE endian at the head. Assigned the coding-system (Lisp
3376 symbol) `utf-16-le' by default.
3377
3378 o coding-category-ccl
3379
3380 The category for a coding system of which encoder/decoder is
3381 written in CCL programs. The default value is nil, i.e., no
3382 coding system is assigned.
3383
3384 o coding-category-binary
3385
3386 The category for a coding system not categorized in any of the
3387 above. Assigned the coding-system (Lisp symbol)
3388 `no-conversion' by default.
3389
3390 Each of them is a Lisp symbol and the value is an actual
3391 `coding-system's (this is also a Lisp symbol) assigned by a user.
3392 What Emacs does actually is to detect a category of coding system.
3393 Then, it uses a `coding-system' assigned to it. If Emacs can't
3394 decide only one possible category, it selects a category of the
3395 highest priority. Priorities of categories are also specified by a
3396 user in a Lisp variable `coding-category-list'.
3397
3398 */
3399
3400 static
3401 int ascii_skip_code[256];
3402
3403 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3404 If it detects possible coding systems, return an integer in which
3405 appropriate flag bits are set. Flag bits are defined by macros
3406 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3407 it should point the table `coding_priorities'. In that case, only
3408 the flag bit for a coding system of the highest priority is set in
3409 the returned value.
3410
3411 How many ASCII characters are at the head is returned as *SKIP. */
3412
3413 static int
3414 detect_coding_mask (source, src_bytes, priorities, skip)
3415 unsigned char *source;
3416 int src_bytes, *priorities, *skip;
3417 {
3418 register unsigned char c;
3419 unsigned char *src = source, *src_end = source + src_bytes;
3420 unsigned int mask, utf16_examined_p, iso2022_examined_p;
3421 int i, idx;
3422
3423 /* At first, skip all ASCII characters and control characters except
3424 for three ISO2022 specific control characters. */
3425 ascii_skip_code[ISO_CODE_SO] = 0;
3426 ascii_skip_code[ISO_CODE_SI] = 0;
3427 ascii_skip_code[ISO_CODE_ESC] = 0;
3428
3429 label_loop_detect_coding:
3430 while (src < src_end && ascii_skip_code[*src]) src++;
3431 *skip = src - source;
3432
3433 if (src >= src_end)
3434 /* We found nothing other than ASCII. There's nothing to do. */
3435 return 0;
3436
3437 c = *src;
3438 /* The text seems to be encoded in some multilingual coding system.
3439 Now, try to find in which coding system the text is encoded. */
3440 if (c < 0x80)
3441 {
3442 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3443 /* C is an ISO2022 specific control code of C0. */
3444 mask = detect_coding_iso2022 (src, src_end);
3445 if (mask == 0)
3446 {
3447 /* No valid ISO2022 code follows C. Try again. */
3448 src++;
3449 if (c == ISO_CODE_ESC)
3450 ascii_skip_code[ISO_CODE_ESC] = 1;
3451 else
3452 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3453 goto label_loop_detect_coding;
3454 }
3455 if (priorities)
3456 {
3457 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3458 {
3459 if (mask & priorities[i])
3460 return priorities[i];
3461 }
3462 return CODING_CATEGORY_MASK_RAW_TEXT;
3463 }
3464 }
3465 else
3466 {
3467 int try;
3468
3469 if (c < 0xA0)
3470 {
3471 /* C is the first byte of SJIS character code,
3472 or a leading-code of Emacs' internal format (emacs-mule),
3473 or the first byte of UTF-16. */
3474 try = (CODING_CATEGORY_MASK_SJIS
3475 | CODING_CATEGORY_MASK_EMACS_MULE
3476 | CODING_CATEGORY_MASK_UTF_16_BE
3477 | CODING_CATEGORY_MASK_UTF_16_LE);
3478
3479 /* Or, if C is a special latin extra code,
3480 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3481 or is an ISO2022 control-sequence-introducer (CSI),
3482 we should also consider the possibility of ISO2022 codings. */
3483 if ((VECTORP (Vlatin_extra_code_table)
3484 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3485 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3486 || (c == ISO_CODE_CSI
3487 && (src < src_end
3488 && (*src == ']'
3489 || ((*src == '0' || *src == '1' || *src == '2')
3490 && src + 1 < src_end
3491 && src[1] == ']')))))
3492 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3493 | CODING_CATEGORY_MASK_ISO_8BIT);
3494 }
3495 else
3496 /* C is a character of ISO2022 in graphic plane right,
3497 or a SJIS's 1-byte character code (i.e. JISX0201),
3498 or the first byte of BIG5's 2-byte code,
3499 or the first byte of UTF-8/16. */
3500 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3501 | CODING_CATEGORY_MASK_ISO_8BIT
3502 | CODING_CATEGORY_MASK_SJIS
3503 | CODING_CATEGORY_MASK_BIG5
3504 | CODING_CATEGORY_MASK_UTF_8
3505 | CODING_CATEGORY_MASK_UTF_16_BE
3506 | CODING_CATEGORY_MASK_UTF_16_LE);
3507
3508 /* Or, we may have to consider the possibility of CCL. */
3509 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3510 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511 ->spec.ccl.valid_codes)[c])
3512 try |= CODING_CATEGORY_MASK_CCL;
3513
3514 mask = 0;
3515 utf16_examined_p = iso2022_examined_p = 0;
3516 if (priorities)
3517 {
3518 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3519 {
3520 if (!iso2022_examined_p
3521 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3522 {
3523 mask |= detect_coding_iso2022 (src, src_end);
3524 iso2022_examined_p = 1;
3525 }
3526 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3527 mask |= detect_coding_sjis (src, src_end);
3528 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3529 mask |= detect_coding_utf_8 (src, src_end);
3530 else if (!utf16_examined_p
3531 && (priorities[i] & try &
3532 CODING_CATEGORY_MASK_UTF_16_BE_LE))
3533 {
3534 mask |= detect_coding_utf_16 (src, src_end);
3535 utf16_examined_p = 1;
3536 }
3537 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3538 mask |= detect_coding_big5 (src, src_end);
3539 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3540 mask |= detect_coding_emacs_mule (src, src_end);
3541 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3542 mask |= detect_coding_ccl (src, src_end);
3543 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3544 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3545 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3546 mask |= CODING_CATEGORY_MASK_BINARY;
3547 if (mask & priorities[i])
3548 return priorities[i];
3549 }
3550 return CODING_CATEGORY_MASK_RAW_TEXT;
3551 }
3552 if (try & CODING_CATEGORY_MASK_ISO)
3553 mask |= detect_coding_iso2022 (src, src_end);
3554 if (try & CODING_CATEGORY_MASK_SJIS)
3555 mask |= detect_coding_sjis (src, src_end);
3556 if (try & CODING_CATEGORY_MASK_BIG5)
3557 mask |= detect_coding_big5 (src, src_end);
3558 if (try & CODING_CATEGORY_MASK_UTF_8)
3559 mask |= detect_coding_utf_8 (src, src_end);
3560 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3561 mask |= detect_coding_utf_16 (src, src_end);
3562 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3563 mask |= detect_coding_emacs_mule (src, src_end);
3564 if (try & CODING_CATEGORY_MASK_CCL)
3565 mask |= detect_coding_ccl (src, src_end);
3566 }
3567 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3568 }
3569
3570 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3571 The information of the detected coding system is set in CODING. */
3572
3573 void
3574 detect_coding (coding, src, src_bytes)
3575 struct coding_system *coding;
3576 unsigned char *src;
3577 int src_bytes;
3578 {
3579 unsigned int idx;
3580 int skip, mask, i;
3581 Lisp_Object val;
3582
3583 val = Vcoding_category_list;
3584 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3585 coding->heading_ascii = skip;
3586
3587 if (!mask) return;
3588
3589 /* We found a single coding system of the highest priority in MASK. */
3590 idx = 0;
3591 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3592 if (! mask)
3593 idx = CODING_CATEGORY_IDX_RAW_TEXT;
3594
3595 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3596
3597 if (coding->eol_type != CODING_EOL_UNDECIDED)
3598 {
3599 Lisp_Object tmp;
3600
3601 tmp = Fget (val, Qeol_type);
3602 if (VECTORP (tmp))
3603 val = XVECTOR (tmp)->contents[coding->eol_type];
3604 }
3605
3606 /* Setup this new coding system while preserving some slots. */
3607 {
3608 int src_multibyte = coding->src_multibyte;
3609 int dst_multibyte = coding->dst_multibyte;
3610
3611 setup_coding_system (val, coding);
3612 coding->src_multibyte = src_multibyte;
3613 coding->dst_multibyte = dst_multibyte;
3614 coding->heading_ascii = skip;
3615 }
3616 }
3617
3618 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3619 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3620 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3621
3622 How many non-eol characters are at the head is returned as *SKIP. */
3623
3624 #define MAX_EOL_CHECK_COUNT 3
3625
3626 static int
3627 detect_eol_type (source, src_bytes, skip)
3628 unsigned char *source;
3629 int src_bytes, *skip;
3630 {
3631 unsigned char *src = source, *src_end = src + src_bytes;
3632 unsigned char c;
3633 int total = 0; /* How many end-of-lines are found so far. */
3634 int eol_type = CODING_EOL_UNDECIDED;
3635 int this_eol_type;
3636
3637 *skip = 0;
3638
3639 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3640 {
3641 c = *src++;
3642 if (c == '\n' || c == '\r')
3643 {
3644 if (*skip == 0)
3645 *skip = src - 1 - source;
3646 total++;
3647 if (c == '\n')
3648 this_eol_type = CODING_EOL_LF;
3649 else if (src >= src_end || *src != '\n')
3650 this_eol_type = CODING_EOL_CR;
3651 else
3652 this_eol_type = CODING_EOL_CRLF, src++;
3653
3654 if (eol_type == CODING_EOL_UNDECIDED)
3655 /* This is the first end-of-line. */
3656 eol_type = this_eol_type;
3657 else if (eol_type != this_eol_type)
3658 {
3659 /* The found type is different from what found before. */
3660 eol_type = CODING_EOL_INCONSISTENT;
3661 break;
3662 }
3663 }
3664 }
3665
3666 if (*skip == 0)
3667 *skip = src_end - source;
3668 return eol_type;
3669 }
3670
3671 /* Like detect_eol_type, but detect EOL type in 2-octet
3672 big-endian/little-endian format for coding systems utf-16-be and
3673 utf-16-le. */
3674
3675 static int
3676 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3677 unsigned char *source;
3678 int src_bytes, *skip;
3679 {
3680 unsigned char *src = source, *src_end = src + src_bytes;
3681 unsigned int c1, c2;
3682 int total = 0; /* How many end-of-lines are found so far. */
3683 int eol_type = CODING_EOL_UNDECIDED;
3684 int this_eol_type;
3685 int msb, lsb;
3686
3687 if (big_endian_p)
3688 msb = 0, lsb = 1;
3689 else
3690 msb = 1, lsb = 0;
3691
3692 *skip = 0;
3693
3694 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3695 {
3696 c1 = (src[msb] << 8) | (src[lsb]);
3697 src += 2;
3698
3699 if (c1 == '\n' || c1 == '\r')
3700 {
3701 if (*skip == 0)
3702 *skip = src - 2 - source;
3703 total++;
3704 if (c1 == '\n')
3705 {
3706 this_eol_type = CODING_EOL_LF;
3707 }
3708 else
3709 {
3710 if ((src + 1) >= src_end)
3711 {
3712 this_eol_type = CODING_EOL_CR;
3713 }
3714 else
3715 {
3716 c2 = (src[msb] << 8) | (src[lsb]);
3717 if (c2 == '\n')
3718 this_eol_type = CODING_EOL_CRLF, src += 2;
3719 else
3720 this_eol_type = CODING_EOL_CR;
3721 }
3722 }
3723
3724 if (eol_type == CODING_EOL_UNDECIDED)
3725 /* This is the first end-of-line. */
3726 eol_type = this_eol_type;
3727 else if (eol_type != this_eol_type)
3728 {
3729 /* The found type is different from what found before. */
3730 eol_type = CODING_EOL_INCONSISTENT;
3731 break;
3732 }
3733 }
3734 }
3735
3736 if (*skip == 0)
3737 *skip = src_end - source;
3738 return eol_type;
3739 }
3740
3741 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3742 is encoded. If it detects an appropriate format of end-of-line, it
3743 sets the information in *CODING. */
3744
3745 void
3746 detect_eol (coding, src, src_bytes)
3747 struct coding_system *coding;
3748 unsigned char *src;
3749 int src_bytes;
3750 {
3751 Lisp_Object val;
3752 int skip;
3753 int eol_type;
3754
3755 switch (coding->category_idx)
3756 {
3757 case CODING_CATEGORY_IDX_UTF_16_BE:
3758 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3759 break;
3760 case CODING_CATEGORY_IDX_UTF_16_LE:
3761 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3762 break;
3763 default:
3764 eol_type = detect_eol_type (src, src_bytes, &skip);
3765 break;
3766 }
3767
3768 if (coding->heading_ascii > skip)
3769 coding->heading_ascii = skip;
3770 else
3771 skip = coding->heading_ascii;
3772
3773 if (eol_type == CODING_EOL_UNDECIDED)
3774 return;
3775 if (eol_type == CODING_EOL_INCONSISTENT)
3776 {
3777 #if 0
3778 /* This code is suppressed until we find a better way to
3779 distinguish raw text file and binary file. */
3780
3781 /* If we have already detected that the coding is raw-text, the
3782 coding should actually be no-conversion. */
3783 if (coding->type == coding_type_raw_text)
3784 {
3785 setup_coding_system (Qno_conversion, coding);
3786 return;
3787 }
3788 /* Else, let's decode only text code anyway. */
3789 #endif /* 0 */
3790 eol_type = CODING_EOL_LF;
3791 }
3792
3793 val = Fget (coding->symbol, Qeol_type);
3794 if (VECTORP (val) && XVECTOR (val)->size == 3)
3795 {
3796 int src_multibyte = coding->src_multibyte;
3797 int dst_multibyte = coding->dst_multibyte;
3798
3799 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3800 coding->src_multibyte = src_multibyte;
3801 coding->dst_multibyte = dst_multibyte;
3802 coding->heading_ascii = skip;
3803 }
3804 }
3805
3806 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3807
3808 #define DECODING_BUFFER_MAG(coding) \
3809 (coding->type == coding_type_iso2022 \
3810 ? 3 \
3811 : (coding->type == coding_type_ccl \
3812 ? coding->spec.ccl.decoder.buf_magnification \
3813 : 2))
3814
3815 /* Return maximum size (bytes) of a buffer enough for decoding
3816 SRC_BYTES of text encoded in CODING. */
3817
3818 int
3819 decoding_buffer_size (coding, src_bytes)
3820 struct coding_system *coding;
3821 int src_bytes;
3822 {
3823 return (src_bytes * DECODING_BUFFER_MAG (coding)
3824 + CONVERSION_BUFFER_EXTRA_ROOM);
3825 }
3826
3827 /* Return maximum size (bytes) of a buffer enough for encoding
3828 SRC_BYTES of text to CODING. */
3829
3830 int
3831 encoding_buffer_size (coding, src_bytes)
3832 struct coding_system *coding;
3833 int src_bytes;
3834 {
3835 int magnification;
3836
3837 if (coding->type == coding_type_ccl)
3838 magnification = coding->spec.ccl.encoder.buf_magnification;
3839 else if (CODING_REQUIRE_ENCODING (coding))
3840 magnification = 3;
3841 else
3842 magnification = 1;
3843
3844 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3845 }
3846
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared working buffer handed out by get_conversion_buffer, and
   its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  The shared buffer is returned as is when it is
   already large enough.  Otherwise it is replaced by a new one whose
   size is obtained by repeated doubling; the contents of the old
   buffer are NOT carried over.

   The result of xmalloc is used without a NULL check.
   NOTE(review): presumably xmalloc does not return on memory
   exhaustion, so this function never returns NULL -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling never gets anywhere from zero, so start from the
         minimum size when no usable buffer exists yet.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3875
3876 int
3877 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3878 struct coding_system *coding;
3879 unsigned char *source, *destination;
3880 int src_bytes, dst_bytes, encodep;
3881 {
3882 struct ccl_program *ccl
3883 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3884 int result;
3885
3886 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3887 if (encodep)
3888 ccl->eol_type = coding->eol_type;
3889 coding->produced = ccl_driver (ccl, source, destination,
3890 src_bytes, dst_bytes, &(coding->consumed));
3891 if (encodep)
3892 coding->produced_char = coding->produced;
3893 else
3894 {
3895 int bytes
3896 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3897 coding->produced = str_as_multibyte (destination, bytes,
3898 coding->produced,
3899 &(coding->produced_char));
3900 }
3901
3902 switch (ccl->status)
3903 {
3904 case CCL_STAT_SUSPEND_BY_SRC:
3905 result = CODING_FINISH_INSUFFICIENT_SRC;
3906 break;
3907 case CCL_STAT_SUSPEND_BY_DST:
3908 result = CODING_FINISH_INSUFFICIENT_DST;
3909 break;
3910 case CCL_STAT_QUIT:
3911 case CCL_STAT_INVALID_CMD:
3912 result = CODING_FINISH_INTERRUPT;
3913 break;
3914 default:
3915 result = CODING_FINISH_NORMAL;
3916 break;
3917 }
3918 return result;
3919 }
3920
3921 /* Decode EOL format of the text at PTR of BYTES length destructively
3922 according to CODING->eol_type. This is called after the CCL
3923 program produced a decoded text at PTR. If we do CRLF->LF
3924 conversion, update CODING->produced and CODING->produced_char. */
3925
3926 static void
3927 decode_eol_post_ccl (coding, ptr, bytes)
3928 struct coding_system *coding;
3929 unsigned char *ptr;
3930 int bytes;
3931 {
3932 Lisp_Object val, saved_coding_symbol;
3933 unsigned char *pend = ptr + bytes;
3934 int dummy;
3935
3936 /* Remember the current coding system symbol. We set it back when
3937 an inconsistent EOL is found so that `last-coding-system-used' is
3938 set to the coding system that doesn't specify EOL conversion. */
3939 saved_coding_symbol = coding->symbol;
3940
3941 coding->spec.ccl.cr_carryover = 0;
3942 if (coding->eol_type == CODING_EOL_UNDECIDED)
3943 {
3944 /* Here, to avoid the call of setup_coding_system, we directly
3945 call detect_eol_type. */
3946 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
3947 if (coding->eol_type == CODING_EOL_INCONSISTENT)
3948 coding->eol_type = CODING_EOL_LF;
3949 if (coding->eol_type != CODING_EOL_UNDECIDED)
3950 {
3951 val = Fget (coding->symbol, Qeol_type);
3952 if (VECTORP (val) && XVECTOR (val)->size == 3)
3953 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
3954 }
3955 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
3956 }
3957
3958 if (coding->eol_type == CODING_EOL_LF
3959 || coding->eol_type == CODING_EOL_UNDECIDED)
3960 {
3961 /* We have nothing to do. */
3962 ptr = pend;
3963 }
3964 else if (coding->eol_type == CODING_EOL_CRLF)
3965 {
3966 unsigned char *pstart = ptr, *p = ptr;
3967
3968 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
3969 && *(pend - 1) == '\r')
3970 {
3971 /* If the last character is CR, we can't handle it here
3972 because LF will be in the not-yet-decoded source text.
3973 Recorded that the CR is not yet processed. */
3974 coding->spec.ccl.cr_carryover = 1;
3975 coding->produced--;
3976 coding->produced_char--;
3977 pend--;
3978 }
3979 while (ptr < pend)
3980 {
3981 if (*ptr == '\r')
3982 {
3983 if (ptr + 1 < pend && *(ptr + 1) == '\n')
3984 {
3985 *p++ = '\n';
3986 ptr += 2;
3987 }
3988 else
3989 {
3990 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3991 goto undo_eol_conversion;
3992 *p++ = *ptr++;
3993 }
3994 }
3995 else if (*ptr == '\n'
3996 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3997 goto undo_eol_conversion;
3998 else
3999 *p++ = *ptr++;
4000 continue;
4001
4002 undo_eol_conversion:
4003 /* We have faced with inconsistent EOL format at PTR.
4004 Convert all LFs before PTR back to CRLFs. */
4005 for (p--, ptr--; p >= pstart; p--)
4006 {
4007 if (*p == '\n')
4008 *ptr-- = '\n', *ptr-- = '\r';
4009 else
4010 *ptr-- = *p;
4011 }
4012 /* If carryover is recorded, cancel it because we don't
4013 convert CRLF anymore. */
4014 if (coding->spec.ccl.cr_carryover)
4015 {
4016 coding->spec.ccl.cr_carryover = 0;
4017 coding->produced++;
4018 coding->produced_char++;
4019 pend++;
4020 }
4021 p = ptr = pend;
4022 coding->eol_type = CODING_EOL_LF;
4023 coding->symbol = saved_coding_symbol;
4024 }
4025 if (p < pend)
4026 {
4027 /* As each two-byte sequence CRLF was converted to LF, (PEND
4028 - P) is the number of deleted characters. */
4029 coding->produced -= pend - p;
4030 coding->produced_char -= pend - p;
4031 }
4032 }
4033 else /* i.e. coding->eol_type == CODING_EOL_CR */
4034 {
4035 unsigned char *p = ptr;
4036
4037 for (; ptr < pend; ptr++)
4038 {
4039 if (*ptr == '\r')
4040 *ptr = '\n';
4041 else if (*ptr == '\n'
4042 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4043 {
4044 for (; p < ptr; p++)
4045 {
4046 if (*p == '\n')
4047 *p = '\r';
4048 }
4049 ptr = pend;
4050 coding->eol_type = CODING_EOL_LF;
4051 coding->symbol = saved_coding_symbol;
4052 }
4053 }
4054 }
4055 }
4056
4057 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4058 decoding, it may detect coding system and format of end-of-line if
4059 those are not yet decided. The source should be unibyte, the
4060 result is multibyte if CODING->dst_multibyte is nonzero, else
4061 unibyte. */
4062
4063 int
4064 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4065 struct coding_system *coding;
4066 unsigned char *source, *destination;
4067 int src_bytes, dst_bytes;
4068 {
4069 if (coding->type == coding_type_undecided)
4070 detect_coding (coding, source, src_bytes);
4071
4072 if (coding->eol_type == CODING_EOL_UNDECIDED
4073 && coding->type != coding_type_ccl)
4074 detect_eol (coding, source, src_bytes);
4075
4076 coding->produced = coding->produced_char = 0;
4077 coding->consumed = coding->consumed_char = 0;
4078 coding->errors = 0;
4079 coding->result = CODING_FINISH_NORMAL;
4080
4081 switch (coding->type)
4082 {
4083 case coding_type_sjis:
4084 decode_coding_sjis_big5 (coding, source, destination,
4085 src_bytes, dst_bytes, 1);
4086 break;
4087
4088 case coding_type_iso2022:
4089 decode_coding_iso2022 (coding, source, destination,
4090 src_bytes, dst_bytes);
4091 break;
4092
4093 case coding_type_big5:
4094 decode_coding_sjis_big5 (coding, source, destination,
4095 src_bytes, dst_bytes, 0);
4096 break;
4097
4098 case coding_type_emacs_mule:
4099 decode_coding_emacs_mule (coding, source, destination,
4100 src_bytes, dst_bytes);
4101 break;
4102
4103 case coding_type_ccl:
4104 if (coding->spec.ccl.cr_carryover)
4105 {
4106 /* Set the CR which is not processed by the previous call of
4107 decode_eol_post_ccl in DESTINATION. */
4108 *destination = '\r';
4109 coding->produced++;
4110 coding->produced_char++;
4111 dst_bytes--;
4112 }
4113 ccl_coding_driver (coding, source,
4114 destination + coding->spec.ccl.cr_carryover,
4115 src_bytes, dst_bytes, 0);
4116 if (coding->eol_type != CODING_EOL_LF)
4117 decode_eol_post_ccl (coding, destination, coding->produced);
4118 break;
4119
4120 default:
4121 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4122 }
4123
4124 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4125 && coding->consumed == src_bytes)
4126 coding->result = CODING_FINISH_NORMAL;
4127
4128 if (coding->mode & CODING_MODE_LAST_BLOCK
4129 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4130 {
4131 unsigned char *src = source + coding->consumed;
4132 unsigned char *dst = destination + coding->produced;
4133
4134 src_bytes -= coding->consumed;
4135 coding->errors++;
4136 if (COMPOSING_P (coding))
4137 DECODE_COMPOSITION_END ('1');
4138 while (src_bytes--)
4139 {
4140 int c = *src++;
4141 dst += CHAR_STRING (c, dst);
4142 coding->produced_char++;
4143 }
4144 coding->consumed = coding->consumed_char = src - source;
4145 coding->produced = dst - destination;
4146 }
4147
4148 if (!coding->dst_multibyte)
4149 {
4150 coding->produced = str_as_unibyte (destination, coding->produced);
4151 coding->produced_char = coding->produced;
4152 }
4153
4154 return coding->result;
4155 }
4156
4157 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4158 multibyteness of the source is CODING->src_multibyte, the
4159 multibyteness of the result is always unibyte. */
4160
4161 int
4162 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4163 struct coding_system *coding;
4164 unsigned char *source, *destination;
4165 int src_bytes, dst_bytes;
4166 {
4167 coding->produced = coding->produced_char = 0;
4168 coding->consumed = coding->consumed_char = 0;
4169 coding->errors = 0;
4170 coding->result = CODING_FINISH_NORMAL;
4171
4172 switch (coding->type)
4173 {
4174 case coding_type_sjis:
4175 encode_coding_sjis_big5 (coding, source, destination,
4176 src_bytes, dst_bytes, 1);
4177 break;
4178
4179 case coding_type_iso2022:
4180 encode_coding_iso2022 (coding, source, destination,
4181 src_bytes, dst_bytes);
4182 break;
4183
4184 case coding_type_big5:
4185 encode_coding_sjis_big5 (coding, source, destination,
4186 src_bytes, dst_bytes, 0);
4187 break;
4188
4189 case coding_type_emacs_mule:
4190 encode_coding_emacs_mule (coding, source, destination,
4191 src_bytes, dst_bytes);
4192 break;
4193
4194 case coding_type_ccl:
4195 ccl_coding_driver (coding, source, destination,
4196 src_bytes, dst_bytes, 1);
4197 break;
4198
4199 default:
4200 encode_eol (coding, source, destination, src_bytes, dst_bytes);
4201 }
4202
4203 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4204 && coding->consumed == src_bytes)
4205 coding->result = CODING_FINISH_NORMAL;
4206
4207 if (coding->mode & CODING_MODE_LAST_BLOCK)
4208 {
4209 unsigned char *src = source + coding->consumed;
4210 unsigned char *src_end = src + src_bytes;
4211 unsigned char *dst = destination + coding->produced;
4212
4213 if (coding->type == coding_type_iso2022)
4214 ENCODE_RESET_PLANE_AND_REGISTER;
4215 if (COMPOSING_P (coding))
4216 *dst++ = ISO_CODE_ESC, *dst++ = '1';
4217 if (coding->consumed < src_bytes)
4218 {
4219 int len = src_bytes - coding->consumed;
4220
4221 BCOPY_SHORT (source + coding->consumed, dst, len);
4222 if (coding->src_multibyte)
4223 len = str_as_unibyte (dst, len);
4224 dst += len;
4225 coding->consumed = src_bytes;
4226 }
4227 coding->produced = coding->produced_char = dst - destination;
4228 }
4229
4230 return coding->result;
4231 }
4232
4233 /* Scan text in the region between *BEG and *END (byte positions),
4234 skip characters which we don't have to decode by coding system
4235 CODING at the head and tail, then set *BEG and *END to the region
4236 of the text we actually have to convert. The caller should move
4237 the gap out of the region in advance if the region is from a
4238 buffer.
4239
4240 If STR is not NULL, *BEG and *END are indices into STR. */
4241
4242 static void
4243 shrink_decoding_region (beg, end, coding, str)
4244 int *beg, *end;
4245 struct coding_system *coding;
4246 unsigned char *str;
4247 {
4248 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4249 int eol_conversion;
4250 Lisp_Object translation_table;
4251
4252 if (coding->type == coding_type_ccl
4253 || coding->type == coding_type_undecided
4254 || coding->eol_type != CODING_EOL_LF
4255 || !NILP (coding->post_read_conversion)
4256 || coding->composing != COMPOSITION_DISABLED)
4257 {
4258 /* We can't skip any data. */
4259 return;
4260 }
4261 if (coding->type == coding_type_no_conversion
4262 || coding->type == coding_type_raw_text
4263 || coding->type == coding_type_emacs_mule)
4264 {
4265 /* We need no conversion, but don't have to skip any data here.
4266 Decoding routine handles them effectively anyway. */
4267 return;
4268 }
4269
4270 translation_table = coding->translation_table_for_decode;
4271 if (NILP (translation_table) && !NILP (Venable_character_translation))
4272 translation_table = Vstandard_translation_table_for_decode;
4273 if (CHAR_TABLE_P (translation_table))
4274 {
4275 int i;
4276 for (i = 0; i < 128; i++)
4277 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4278 break;
4279 if (i < 128)
4280 /* Some ASCII character should be translated. We give up
4281 shrinking. */
4282 return;
4283 }
4284
4285 if (coding->heading_ascii >= 0)
4286 /* Detection routine has already found how much we can skip at the
4287 head. */
4288 *beg += coding->heading_ascii;
4289
4290 if (str)
4291 {
4292 begp_orig = begp = str + *beg;
4293 endp_orig = endp = str + *end;
4294 }
4295 else
4296 {
4297 begp_orig = begp = BYTE_POS_ADDR (*beg);
4298 endp_orig = endp = begp + *end - *beg;
4299 }
4300
4301 eol_conversion = (coding->eol_type == CODING_EOL_CR
4302 || coding->eol_type == CODING_EOL_CRLF);
4303
4304 switch (coding->type)
4305 {
4306 case coding_type_sjis:
4307 case coding_type_big5:
4308 /* We can skip all ASCII characters at the head. */
4309 if (coding->heading_ascii < 0)
4310 {
4311 if (eol_conversion)
4312 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4313 else
4314 while (begp < endp && *begp < 0x80) begp++;
4315 }
4316 /* We can skip all ASCII characters at the tail except for the
4317 second byte of SJIS or BIG5 code. */
4318 if (eol_conversion)
4319 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4320 else
4321 while (begp < endp && endp[-1] < 0x80) endp--;
4322 /* Do not consider LF as ascii if preceded by CR, since that
4323 confuses eol decoding. */
4324 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4325 endp++;
4326 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4327 endp++;
4328 break;
4329
4330 case coding_type_iso2022:
4331 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4332 /* We can't skip any data. */
4333 break;
4334 if (coding->heading_ascii < 0)
4335 {
4336 /* We can skip all ASCII characters at the head except for a
4337 few control codes. */
4338 while (begp < endp && (c = *begp) < 0x80
4339 && c != ISO_CODE_CR && c != ISO_CODE_SO
4340 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4341 && (!eol_conversion || c != ISO_CODE_LF))
4342 begp++;
4343 }
4344 switch (coding->category_idx)
4345 {
4346 case CODING_CATEGORY_IDX_ISO_8_1:
4347 case CODING_CATEGORY_IDX_ISO_8_2:
4348 /* We can skip all ASCII characters at the tail. */
4349 if (eol_conversion)
4350 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4351 else
4352 while (begp < endp && endp[-1] < 0x80) endp--;
4353 /* Do not consider LF as ascii if preceded by CR, since that
4354 confuses eol decoding. */
4355 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4356 endp++;
4357 break;
4358
4359 case CODING_CATEGORY_IDX_ISO_7:
4360 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4361 {
4362 /* We can skip all charactes at the tail except for 8-bit
4363 codes and ESC and the following 2-byte at the tail. */
4364 unsigned char *eight_bit = NULL;
4365
4366 if (eol_conversion)
4367 while (begp < endp
4368 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4369 {
4370 if (!eight_bit && c & 0x80) eight_bit = endp;
4371 endp--;
4372 }
4373 else
4374 while (begp < endp
4375 && (c = endp[-1]) != ISO_CODE_ESC)
4376 {
4377 if (!eight_bit && c & 0x80) eight_bit = endp;
4378 endp--;
4379 }
4380 /* Do not consider LF as ascii if preceded by CR, since that
4381 confuses eol decoding. */
4382 if (begp < endp && endp < endp_orig
4383 && endp[-1] == '\r' && endp[0] == '\n')
4384 endp++;
4385 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4386 {
4387 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4388 /* This is an ASCII designation sequence. We can
4389 surely skip the tail. But, if we have
4390 encountered an 8-bit code, skip only the codes
4391 after that. */
4392 endp = eight_bit ? eight_bit : endp + 2;
4393 else
4394 /* Hmmm, we can't skip the tail. */
4395 endp = endp_orig;
4396 }
4397 else if (eight_bit)
4398 endp = eight_bit;
4399 }
4400 }
4401 break;
4402
4403 default:
4404 abort ();
4405 }
4406 *beg += begp - begp_orig;
4407 *end += endp - endp_orig;
4408 return;
4409 }
4410
4411 /* Like shrink_decoding_region but for encoding. */
4412
4413 static void
4414 shrink_encoding_region (beg, end, coding, str)
4415 int *beg, *end;
4416 struct coding_system *coding;
4417 unsigned char *str;
4418 {
4419 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4420 int eol_conversion;
4421 Lisp_Object translation_table;
4422
4423 if (coding->type == coding_type_ccl
4424 || coding->eol_type == CODING_EOL_CRLF
4425 || coding->eol_type == CODING_EOL_CR
4426 || coding->cmp_data && coding->cmp_data->used > 0)
4427 {
4428 /* We can't skip any data. */
4429 return;
4430 }
4431 if (coding->type == coding_type_no_conversion
4432 || coding->type == coding_type_raw_text
4433 || coding->type == coding_type_emacs_mule
4434 || coding->type == coding_type_undecided)
4435 {
4436 /* We need no conversion, but don't have to skip any data here.
4437 Encoding routine handles them effectively anyway. */
4438 return;
4439 }
4440
4441 translation_table = coding->translation_table_for_encode;
4442 if (NILP (translation_table) && !NILP (Venable_character_translation))
4443 translation_table = Vstandard_translation_table_for_encode;
4444 if (CHAR_TABLE_P (translation_table))
4445 {
4446 int i;
4447 for (i = 0; i < 128; i++)
4448 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4449 break;
4450 if (i < 128)
4451 /* Some ASCII character should be tranlsated. We give up
4452 shrinking. */
4453 return;
4454 }
4455
4456 if (str)
4457 {
4458 begp_orig = begp = str + *beg;
4459 endp_orig = endp = str + *end;
4460 }
4461 else
4462 {
4463 begp_orig = begp = BYTE_POS_ADDR (*beg);
4464 endp_orig = endp = begp + *end - *beg;
4465 }
4466
4467 eol_conversion = (coding->eol_type == CODING_EOL_CR
4468 || coding->eol_type == CODING_EOL_CRLF);
4469
4470 /* Here, we don't have to check coding->pre_write_conversion because
4471 the caller is expected to have handled it already. */
4472 switch (coding->type)
4473 {
4474 case coding_type_iso2022:
4475 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4476 /* We can't skip any data. */
4477 break;
4478 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4479 {
4480 unsigned char *bol = begp;
4481 while (begp < endp && *begp < 0x80)
4482 {
4483 begp++;
4484 if (begp[-1] == '\n')
4485 bol = begp;
4486 }
4487 begp = bol;
4488 goto label_skip_tail;
4489 }
4490 /* fall down ... */
4491
4492 case coding_type_sjis:
4493 case coding_type_big5:
4494 /* We can skip all ASCII characters at the head and tail. */
4495 if (eol_conversion)
4496 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4497 else
4498 while (begp < endp && *begp < 0x80) begp++;
4499 label_skip_tail:
4500 if (eol_conversion)
4501 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4502 else
4503 while (begp < endp && *(endp - 1) < 0x80) endp--;
4504 break;
4505
4506 default:
4507 abort ();
4508 }
4509
4510 *beg += begp - begp_orig;
4511 *end += endp - endp_orig;
4512 return;
4513 }
4514
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region *BEG .. *END for conversion by CODING, using
   shrink_encoding_region if ENCODEP is nonzero and
   shrink_decoding_region otherwise.  Regions not longer than the
   threshold above are left alone.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do                                                                    \
    {                                                                   \
      if (*(end) - *(beg) <= shrink_conversion_region_threshhold)       \
        break;                                                          \
      if (encodep)                                                      \
        shrink_encoding_region (beg, end, coding, str);                 \
      else                                                              \
        shrink_decoding_region (beg, end, coding, str);                 \
    }                                                                   \
  while (0)
4528
4529 static Lisp_Object
4530 code_convert_region_unwind (dummy)
4531 Lisp_Object dummy;
4532 {
4533 inhibit_pre_post_conversion = 0;
4534 return Qnil;
4535 }
4536
4537 /* Store information about all compositions in the range FROM and TO
4538 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4539 buffer or a string, defaults to the current buffer. */
4540
4541 void
4542 coding_save_composition (coding, from, to, obj)
4543 struct coding_system *coding;
4544 int from, to;
4545 Lisp_Object obj;
4546 {
4547 Lisp_Object prop;
4548 int start, end;
4549
4550 if (coding->composing == COMPOSITION_DISABLED)
4551 return;
4552 if (!coding->cmp_data)
4553 coding_allocate_composition_data (coding, from);
4554 if (!find_composition (from, to, &start, &end, &prop, obj)
4555 || end > to)
4556 return;
4557 if (start < from
4558 && (!find_composition (end, to, &start, &end, &prop, obj)
4559 || end > to))
4560 return;
4561 coding->composing = COMPOSITION_NO;
4562 do
4563 {
4564 if (COMPOSITION_VALID_P (start, end, prop))
4565 {
4566 enum composition_method method = COMPOSITION_METHOD (prop);
4567 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4568 >= COMPOSITION_DATA_SIZE)
4569 coding_allocate_composition_data (coding, from);
4570 /* For relative composition, we remember start and end
4571 positions, for the other compositions, we also remember
4572 components. */
4573 CODING_ADD_COMPOSITION_START (coding, start - from, method);
4574 if (method != COMPOSITION_RELATIVE)
4575 {
4576 /* We must store a*/
4577 Lisp_Object val, ch;
4578
4579 val = COMPOSITION_COMPONENTS (prop);
4580 if (CONSP (val))
4581 while (CONSP (val))
4582 {
4583 ch = XCAR (val), val = XCDR (val);
4584 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4585 }
4586 else if (VECTORP (val) || STRINGP (val))
4587 {
4588 int len = (VECTORP (val)
4589 ? XVECTOR (val)->size : XSTRING (val)->size);
4590 int i;
4591 for (i = 0; i < len; i++)
4592 {
4593 ch = (STRINGP (val)
4594 ? Faref (val, make_number (i))
4595 : XVECTOR (val)->contents[i]);
4596 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4597 }
4598 }
4599 else /* INTEGERP (val) */
4600 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4601 }
4602 CODING_ADD_COMPOSITION_END (coding, end - from);
4603 }
4604 start = end;
4605 }
4606 while (start < to
4607 && find_composition (start, to, &start, &end, &prop, obj)
4608 && end <= to);
4609
4610 /* Make coding->cmp_data point to the first memory block. */
4611 while (coding->cmp_data->prev)
4612 coding->cmp_data = coding->cmp_data->prev;
4613 coding->cmp_data_start = 0;
4614 }
4615
4616 /* Reflect the saved information about compositions to OBJ.
4617 CODING->cmp_data points to a memory block for the informaiton. OBJ
4618 is a buffer or a string, defaults to the current buffer. */
4619
4620 void
4621 coding_restore_composition (coding, obj)
4622 struct coding_system *coding;
4623 Lisp_Object obj;
4624 {
4625 struct composition_data *cmp_data = coding->cmp_data;
4626
4627 if (!cmp_data)
4628 return;
4629
4630 while (cmp_data->prev)
4631 cmp_data = cmp_data->prev;
4632
4633 while (cmp_data)
4634 {
4635 int i;
4636
4637 for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4638 {
4639 int *data = cmp_data->data + i;
4640 enum composition_method method = (enum composition_method) data[3];
4641 Lisp_Object components;
4642
4643 if (method == COMPOSITION_RELATIVE)
4644 components = Qnil;
4645 else
4646 {
4647 int len = data[0] - 4, j;
4648 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4649
4650 for (j = 0; j < len; j++)
4651 args[j] = make_number (data[4 + j]);
4652 components = (method == COMPOSITION_WITH_ALTCHARS
4653 ? Fstring (len, args) : Fvector (len, args));
4654 }
4655 compose_text (data[1], data[2], components, Qnil, obj);
4656 }
4657 cmp_data = cmp_data->next;
4658 }
4659 }
4660
4661 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4662 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4663 coding system CODING, and return the status code of code conversion
4664 (currently, this value has no meaning).
4665
4666 How many characters (and bytes) are converted to how many
4667 characters (and bytes) are recorded in members of the structure
4668 CODING.
4669
4670 If REPLACE is nonzero, we do various things as if the original text
4671 is deleted and a new text is inserted. See the comments in
4672 replace_range (insdel.c) to know what we are doing.
4673
4674 If REPLACE is zero, it is assumed that the source text is unibyte.
4675 Otherwise, it is assumed that the source text is multibyte. */
4676
4677 int
4678 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4679 int from, from_byte, to, to_byte, encodep, replace;
4680 struct coding_system *coding;
4681 {
4682 int len = to - from, len_byte = to_byte - from_byte;
4683 int require, inserted, inserted_byte;
4684 int head_skip, tail_skip, total_skip = 0;
4685 Lisp_Object saved_coding_symbol;
4686 int first = 1;
4687 unsigned char *src, *dst;
4688 Lisp_Object deletion;
4689 int orig_point = PT, orig_len = len;
4690 int prev_Z;
4691 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4692
4693 coding->src_multibyte = replace && multibyte_p;
4694 coding->dst_multibyte = multibyte_p;
4695
4696 deletion = Qnil;
4697 saved_coding_symbol = Qnil;
4698
4699 if (from < PT && PT < to)
4700 {
4701 TEMP_SET_PT_BOTH (from, from_byte);
4702 orig_point = from;
4703 }
4704
4705 if (replace)
4706 {
4707 int saved_from = from;
4708
4709 prepare_to_modify_buffer (from, to, &from);
4710 if (saved_from != from)
4711 {
4712 to = from + len;
4713 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4714 len_byte = to_byte - from_byte;
4715 }
4716 }
4717
4718 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4719 {
4720 /* We must detect encoding of text and eol format. */
4721
4722 if (from < GPT && to > GPT)
4723 move_gap_both (from, from_byte);
4724 if (coding->type == coding_type_undecided)
4725 {
4726 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4727 if (coding->type == coding_type_undecided)
4728 /* It seems that the text contains only ASCII, but we
4729 should not left it undecided because the deeper
4730 decoding routine (decode_coding) tries to detect the
4731 encodings again in vain. */
4732 coding->type = coding_type_emacs_mule;
4733 }
4734 if (coding->eol_type == CODING_EOL_UNDECIDED
4735 && coding->type != coding_type_ccl)
4736 {
4737 saved_coding_symbol = coding->symbol;
4738 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4739 if (coding->eol_type == CODING_EOL_UNDECIDED)
4740 coding->eol_type = CODING_EOL_LF;
4741 /* We had better recover the original eol format if we
4742 encounter an inconsitent eol format while decoding. */
4743 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4744 }
4745 }
4746
4747 /* Now we convert the text. */
4748
4749 /* For encoding, we must process pre-write-conversion in advance. */
4750 if (! inhibit_pre_post_conversion
4751 && encodep
4752 && SYMBOLP (coding->pre_write_conversion)
4753 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4754 {
4755 /* The function in pre-write-conversion may put a new text in a
4756 new buffer. */
4757 struct buffer *prev = current_buffer;
4758 Lisp_Object new;
4759 int count = specpdl_ptr - specpdl;
4760
4761 record_unwind_protect (code_convert_region_unwind, Qnil);
4762 /* We should not call any more pre-write/post-read-conversion
4763 functions while this pre-write-conversion is running. */
4764 inhibit_pre_post_conversion = 1;
4765 call2 (coding->pre_write_conversion,
4766 make_number (from), make_number (to));
4767 inhibit_pre_post_conversion = 0;
4768 /* Discard the unwind protect. */
4769 specpdl_ptr--;
4770
4771 if (current_buffer != prev)
4772 {
4773 len = ZV - BEGV;
4774 new = Fcurrent_buffer ();
4775 set_buffer_internal_1 (prev);
4776 del_range_2 (from, from_byte, to, to_byte, 0);
4777 TEMP_SET_PT_BOTH (from, from_byte);
4778 insert_from_buffer (XBUFFER (new), 1, len, 0);
4779 Fkill_buffer (new);
4780 if (orig_point >= to)
4781 orig_point += len - orig_len;
4782 else if (orig_point > from)
4783 orig_point = from;
4784 orig_len = len;
4785 to = from + len;
4786 from_byte = CHAR_TO_BYTE (from);
4787 to_byte = CHAR_TO_BYTE (to);
4788 len_byte = to_byte - from_byte;
4789 TEMP_SET_PT_BOTH (from, from_byte);
4790 }
4791 }
4792
4793 if (replace)
4794 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4795
4796 if (coding->composing != COMPOSITION_DISABLED)
4797 {
4798 if (encodep)
4799 coding_save_composition (coding, from, to, Fcurrent_buffer ());
4800 else
4801 coding_allocate_composition_data (coding, from);
4802 }
4803
4804 /* Try to skip the heading and tailing ASCIIs. */
4805 if (coding->type != coding_type_ccl)
4806 {
4807 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4808
4809 if (from < GPT && GPT < to)
4810 move_gap_both (from, from_byte);
4811 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4812 if (from_byte == to_byte
4813 && (encodep || NILP (coding->post_read_conversion))
4814 && ! CODING_REQUIRE_FLUSHING (coding))
4815 {
4816 coding->produced = len_byte;
4817 coding->produced_char = len;
4818 if (!replace)
4819 /* We must record and adjust for this new text now. */
4820 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4821 return 0;
4822 }
4823
4824 head_skip = from_byte - from_byte_orig;
4825 tail_skip = to_byte_orig - to_byte;
4826 total_skip = head_skip + tail_skip;
4827 from += head_skip;
4828 to -= tail_skip;
4829 len -= total_skip; len_byte -= total_skip;
4830 }
4831
4832 /* The code conversion routine can not preserve text properties for
4833 now. So, we must remove all text properties in the region.
4834 Here, we must suppress all modification hooks. */
4835 if (replace)
4836 {
4837 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4838 inhibit_modification_hooks = 1;
4839 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4840 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4841 }
4842
4843 /* For converion, we must put the gap before the text in addition to
4844 making the gap larger for efficient decoding. The required gap
4845 size starts from 2000 which is the magic number used in make_gap.
4846 But, after one batch of conversion, it will be incremented if we
4847 find that it is not enough . */
4848 require = 2000;
4849
4850 if (GAP_SIZE < require)
4851 make_gap (require - GAP_SIZE);
4852 move_gap_both (from, from_byte);
4853
4854 inserted = inserted_byte = 0;
4855
4856 GAP_SIZE += len_byte;
4857 ZV -= len;
4858 Z -= len;
4859 ZV_BYTE -= len_byte;
4860 Z_BYTE -= len_byte;
4861
4862 if (GPT - BEG < BEG_UNCHANGED)
4863 BEG_UNCHANGED = GPT - BEG;
4864 if (Z - GPT < END_UNCHANGED)
4865 END_UNCHANGED = Z - GPT;
4866
4867 if (!encodep && coding->src_multibyte)
4868 {
4869 /* Decoding routines expects that the source text is unibyte.
4870 We must convert 8-bit characters of multibyte form to
4871 unibyte. */
4872 int len_byte_orig = len_byte;
4873 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4874 if (len_byte < len_byte_orig)
4875 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4876 len_byte);
4877 coding->src_multibyte = 0;
4878 }
4879
4880 for (;;)
4881 {
4882 int result;
4883
4884 /* The buffer memory is now:
4885 +--------+converted-text+---------+-------original-text-------+---+
4886 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4887 |<---------------------- GAP ----------------------->| */
4888 src = GAP_END_ADDR - len_byte;
4889 dst = GPT_ADDR + inserted_byte;
4890
4891 if (encodep)
4892 result = encode_coding (coding, src, dst, len_byte, 0);
4893 else
4894 result = decode_coding (coding, src, dst, len_byte, 0);
4895
4896 /* The buffer memory is now:
4897 +--------+-------converted-text----+--+------original-text----+---+
4898 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4899 |<---------------------- GAP ----------------------->| */
4900
4901 inserted += coding->produced_char;
4902 inserted_byte += coding->produced;
4903 len_byte -= coding->consumed;
4904
4905 if (result == CODING_FINISH_INSUFFICIENT_CMP)
4906 {
4907 coding_allocate_composition_data (coding, from + inserted);
4908 continue;
4909 }
4910
4911 src += coding->consumed;
4912 dst += coding->produced;
4913
4914 if (result == CODING_FINISH_NORMAL)
4915 {
4916 src += len_byte;
4917 break;
4918 }
4919 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4920 {
4921 unsigned char *pend = dst, *p = pend - inserted_byte;
4922 Lisp_Object eol_type;
4923
4924 /* Encode LFs back to the original eol format (CR or CRLF). */
4925 if (coding->eol_type == CODING_EOL_CR)
4926 {
4927 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4928 }
4929 else
4930 {
4931 int count = 0;
4932
4933 while (p < pend) if (*p++ == '\n') count++;
4934 if (src - dst < count)
4935 {
4936 /* We don't have sufficient room for encoding LFs
4937 back to CRLF. We must record converted and
4938 not-yet-converted text back to the buffer
4939 content, enlarge the gap, then record them out of
4940 the buffer contents again. */
4941 int add = len_byte + inserted_byte;
4942
4943 GAP_SIZE -= add;
4944 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4945 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4946 make_gap (count - GAP_SIZE);
4947 GAP_SIZE += add;
4948 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4949 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4950 /* Don't forget to update SRC, DST, and PEND. */
4951 src = GAP_END_ADDR - len_byte;
4952 dst = GPT_ADDR + inserted_byte;
4953 pend = dst;
4954 }
4955 inserted += count;
4956 inserted_byte += count;
4957 coding->produced += count;
4958 p = dst = pend + count;
4959 while (count)
4960 {
4961 *--p = *--pend;
4962 if (*p == '\n') count--, *--p = '\r';
4963 }
4964 }
4965
4966 /* Suppress eol-format conversion in the further conversion. */
4967 coding->eol_type = CODING_EOL_LF;
4968
4969 /* Set the coding system symbol to that for Unix-like EOL. */
4970 eol_type = Fget (saved_coding_symbol, Qeol_type);
4971 if (VECTORP (eol_type)
4972 && XVECTOR (eol_type)->size == 3
4973 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4974 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4975 else
4976 coding->symbol = saved_coding_symbol;
4977
4978 continue;
4979 }
4980 if (len_byte <= 0)
4981 {
4982 if (coding->type != coding_type_ccl
4983 || coding->mode & CODING_MODE_LAST_BLOCK)
4984 break;
4985 coding->mode |= CODING_MODE_LAST_BLOCK;
4986 continue;
4987 }
4988 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4989 {
4990 /* The source text ends in invalid codes. Let's just
4991 make them valid buffer contents, and finish conversion. */
4992 inserted += len_byte;
4993 inserted_byte += len_byte;
4994 while (len_byte--)
4995 *dst++ = *src++;
4996 break;
4997 }
4998 if (result == CODING_FINISH_INTERRUPT)
4999 {
5000 /* The conversion procedure was interrupted by a user. */
5001 break;
5002 }
5003 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5004 if (coding->consumed < 1)
5005 {
5006 /* It's quite strange to require more memory without
5007 consuming any bytes. Perhaps CCL program bug. */
5008 break;
5009 }
5010 if (first)
5011 {
5012 /* We have just done the first batch of conversion which was
5013 stopped because of insufficient gap. Let's reconsider the
5014 required gap size (i.e. SRC - DST) now.
5015
5016 We have converted ORIG bytes (== coding->consumed) into
5017 NEW bytes (coding->produced). To convert the remaining
5018 LEN bytes, we may need REQUIRE bytes of gap, where:
5019 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5020 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5021 Here, we are sure that NEW >= ORIG. */
5022 float ratio = coding->produced - coding->consumed;
5023 ratio /= coding->consumed;
5024 require = len_byte * ratio;
5025 first = 0;
5026 }
5027 if ((src - dst) < (require + 2000))
5028 {
5029 /* See the comment above the previous call of make_gap. */
5030 int add = len_byte + inserted_byte;
5031
5032 GAP_SIZE -= add;
5033 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5034 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5035 make_gap (require + 2000);
5036 GAP_SIZE += add;
5037 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5038 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5039 }
5040 }
5041 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5042
5043 if (encodep && coding->dst_multibyte)
5044 {
5045 /* The output is unibyte. We must convert 8-bit characters to
5046 multibyte form. */
5047 if (inserted_byte * 2 > GAP_SIZE)
5048 {
5049 GAP_SIZE -= inserted_byte;
5050 ZV += inserted_byte; Z += inserted_byte;
5051 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5052 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5053 make_gap (inserted_byte - GAP_SIZE);
5054 GAP_SIZE += inserted_byte;
5055 ZV -= inserted_byte; Z -= inserted_byte;
5056 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5057 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5058 }
5059 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5060 }
5061
5062 /* If we have shrunk the conversion area, adjust it now. */
5063 if (total_skip > 0)
5064 {
5065 if (tail_skip > 0)
5066 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5067 inserted += total_skip; inserted_byte += total_skip;
5068 GAP_SIZE += total_skip;
5069 GPT -= head_skip; GPT_BYTE -= head_skip;
5070 ZV -= total_skip; ZV_BYTE -= total_skip;
5071 Z -= total_skip; Z_BYTE -= total_skip;
5072 from -= head_skip; from_byte -= head_skip;
5073 to += tail_skip; to_byte += tail_skip;
5074 }
5075
5076 prev_Z = Z;
5077 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5078 inserted = Z - prev_Z;
5079
5080 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5081 coding_restore_composition (coding, Fcurrent_buffer ());
5082 coding_free_composition_data (coding);
5083
5084 if (! inhibit_pre_post_conversion
5085 && ! encodep && ! NILP (coding->post_read_conversion))
5086 {
5087 Lisp_Object val;
5088 int count = specpdl_ptr - specpdl;
5089
5090 if (from != PT)
5091 TEMP_SET_PT_BOTH (from, from_byte);
5092 prev_Z = Z;
5093 record_unwind_protect (code_convert_region_unwind, Qnil);
5094 /* We should not call any more pre-write/post-read-conversion
5095 functions while this post-read-conversion is running. */
5096 inhibit_pre_post_conversion = 1;
5097 val = call1 (coding->post_read_conversion, make_number (inserted));
5098 inhibit_pre_post_conversion = 0;
5099 /* Discard the unwind protect. */
5100 specpdl_ptr--;
5101 CHECK_NUMBER (val, 0);
5102 inserted += Z - prev_Z;
5103 }
5104
5105 if (orig_point >= from)
5106 {
5107 if (orig_point >= from + orig_len)
5108 orig_point += inserted - orig_len;
5109 else
5110 orig_point = from;
5111 TEMP_SET_PT (orig_point);
5112 }
5113
5114 if (replace)
5115 {
5116 signal_after_change (from, to - from, inserted);
5117 update_compositions (from, from + inserted, CHECK_BORDER);
5118 }
5119
5120 {
5121 coding->consumed = to_byte - from_byte;
5122 coding->consumed_char = to - from;
5123 coding->produced = inserted_byte;
5124 coding->produced_char = inserted;
5125 }
5126
5127 return 0;
5128 }
5129
5130 Lisp_Object
5131 run_pre_post_conversion_on_str (str, coding, encodep)
5132 Lisp_Object str;
5133 struct coding_system *coding;
5134 int encodep;
5135 {
5136 int count = specpdl_ptr - specpdl;
5137 struct gcpro gcpro1;
5138 struct buffer *prev = current_buffer;
5139 int multibyte = STRING_MULTIBYTE (str);
5140
5141 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5142 record_unwind_protect (code_convert_region_unwind, Qnil);
5143 GCPRO1 (str);
5144 temp_output_buffer_setup (" *code-converting-work*");
5145 set_buffer_internal (XBUFFER (Vstandard_output));
5146 /* We must insert the contents of STR as is without
5147 unibyte<->multibyte conversion. For that, we adjust the
5148 multibyteness of the working buffer to that of STR. */
5149 Ferase_buffer ();
5150 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5151 insert_from_string (str, 0, 0,
5152 XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5153 UNGCPRO;
5154 inhibit_pre_post_conversion = 1;
5155 if (encodep)
5156 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5157 else
5158 {
5159 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5160 call1 (coding->post_read_conversion, make_number (Z - BEG));
5161 }
5162 inhibit_pre_post_conversion = 0;
5163 str = make_buffer_string (BEG, Z, 0);
5164 return unbind_to (count, str);
5165 }
5166
5167 Lisp_Object
5168 decode_coding_string (str, coding, nocopy)
5169 Lisp_Object str;
5170 struct coding_system *coding;
5171 int nocopy;
5172 {
5173 int len;
5174 char *buf;
5175 int from, to, to_byte;
5176 struct gcpro gcpro1;
5177 Lisp_Object saved_coding_symbol;
5178 int result;
5179
5180 from = 0;
5181 to = XSTRING (str)->size;
5182 to_byte = STRING_BYTES (XSTRING (str));
5183
5184 saved_coding_symbol = Qnil;
5185 if (CODING_REQUIRE_DETECTION (coding))
5186 {
5187 /* See the comments in code_convert_region. */
5188 if (coding->type == coding_type_undecided)
5189 {
5190 detect_coding (coding, XSTRING (str)->data, to_byte);
5191 if (coding->type == coding_type_undecided)
5192 coding->type = coding_type_emacs_mule;
5193 }
5194 if (coding->eol_type == CODING_EOL_UNDECIDED
5195 && coding->type != coding_type_ccl)
5196 {
5197 saved_coding_symbol = coding->symbol;
5198 detect_eol (coding, XSTRING (str)->data, to_byte);
5199 if (coding->eol_type == CODING_EOL_UNDECIDED)
5200 coding->eol_type = CODING_EOL_LF;
5201 /* We had better recover the original eol format if we
5202 encounter an inconsitent eol format while decoding. */
5203 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5204 }
5205 }
5206
5207 if (! CODING_REQUIRE_DECODING (coding))
5208 {
5209 if (!STRING_MULTIBYTE (str))
5210 {
5211 str = Fstring_as_multibyte (str);
5212 nocopy = 1;
5213 }
5214 return (nocopy ? str : Fcopy_sequence (str));
5215 }
5216
5217 if (STRING_MULTIBYTE (str))
5218 {
5219 /* Decoding routines expect the source text to be unibyte. */
5220 str = Fstring_as_unibyte (str);
5221 to_byte = STRING_BYTES (XSTRING (str));
5222 nocopy = 1;
5223 coding->src_multibyte = 0;
5224 }
5225 coding->dst_multibyte = 1;
5226
5227 if (coding->composing != COMPOSITION_DISABLED)
5228 coding_allocate_composition_data (coding, from);
5229
5230 /* Try to skip the heading and tailing ASCIIs. */
5231 if (coding->type != coding_type_ccl)
5232 {
5233 int from_orig = from;
5234
5235 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5236 0);
5237 if (from == to_byte)
5238 return (nocopy ? str : Fcopy_sequence (str));
5239 }
5240
5241 len = decoding_buffer_size (coding, to_byte - from);
5242 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5243 GCPRO1 (str);
5244 buf = get_conversion_buffer (len);
5245 UNGCPRO;
5246
5247 if (from > 0)
5248 bcopy (XSTRING (str)->data, buf, from);
5249 result = decode_coding (coding, XSTRING (str)->data + from,
5250 buf + from, to_byte - from, len);
5251 if (result == CODING_FINISH_INCONSISTENT_EOL)
5252 {
5253 /* We simply try to decode the whole string again but without
5254 eol-conversion this time. */
5255 coding->eol_type = CODING_EOL_LF;
5256 coding->symbol = saved_coding_symbol;
5257 coding_free_composition_data (coding);
5258 return decode_coding_string (str, coding, nocopy);
5259 }
5260
5261 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5262 STRING_BYTES (XSTRING (str)) - to_byte);
5263
5264 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5265 str = make_multibyte_string (buf, len + coding->produced_char,
5266 len + coding->produced);
5267
5268 if (coding->cmp_data && coding->cmp_data->used)
5269 coding_restore_composition (coding, str);
5270 coding_free_composition_data (coding);
5271
5272 if (SYMBOLP (coding->post_read_conversion)
5273 && !NILP (Ffboundp (coding->post_read_conversion)))
5274 str = run_pre_post_conversion_on_str (str, coding, 0);
5275
5276 return str;
5277 }
5278
5279 Lisp_Object
5280 encode_coding_string (str, coding, nocopy)
5281 Lisp_Object str;
5282 struct coding_system *coding;
5283 int nocopy;
5284 {
5285 int len;
5286 char *buf;
5287 int from, to, to_byte;
5288 struct gcpro gcpro1;
5289 Lisp_Object saved_coding_symbol;
5290 int result;
5291
5292 if (SYMBOLP (coding->pre_write_conversion)
5293 && !NILP (Ffboundp (coding->pre_write_conversion)))
5294 str = run_pre_post_conversion_on_str (str, coding, 1);
5295
5296 from = 0;
5297 to = XSTRING (str)->size;
5298 to_byte = STRING_BYTES (XSTRING (str));
5299
5300 saved_coding_symbol = Qnil;
5301 if (! CODING_REQUIRE_ENCODING (coding))
5302 {
5303 if (STRING_MULTIBYTE (str))
5304 {
5305 str = Fstring_as_unibyte (str);
5306 nocopy = 1;
5307 }
5308 return (nocopy ? str : Fcopy_sequence (str));
5309 }
5310
5311 /* Encoding routines determine the multibyteness of the source text
5312 by coding->src_multibyte. */
5313 coding->src_multibyte = STRING_MULTIBYTE (str);
5314 coding->dst_multibyte = 0;
5315
5316 if (coding->composing != COMPOSITION_DISABLED)
5317 coding_save_composition (coding, from, to, str);
5318
5319 /* Try to skip the heading and tailing ASCIIs. */
5320 if (coding->type != coding_type_ccl)
5321 {
5322 int from_orig = from;
5323
5324 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5325 1);
5326 if (from == to_byte)
5327 return (nocopy ? str : Fcopy_sequence (str));
5328 }
5329
5330 len = encoding_buffer_size (coding, to_byte - from);
5331 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5332 GCPRO1 (str);
5333 buf = get_conversion_buffer (len);
5334 UNGCPRO;
5335
5336 if (from > 0)
5337 bcopy (XSTRING (str)->data, buf, from);
5338 result = encode_coding (coding, XSTRING (str)->data + from,
5339 buf + from, to_byte - from, len);
5340 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5341 STRING_BYTES (XSTRING (str)) - to_byte);
5342
5343 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5344 str = make_unibyte_string (buf, len + coding->produced);
5345 coding_free_composition_data (coding);
5346
5347 return str;
5348 }
5349
5350 \f
5351 #ifdef emacs
5352 /*** 8. Emacs Lisp library functions ***/
5353
5354 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5355 "Return t if OBJECT is nil or a coding-system.\n\
5356 See the documentation of `make-coding-system' for information\n\
5357 about coding-system objects.")
5358 (obj)
5359 Lisp_Object obj;
5360 {
5361 if (NILP (obj))
5362 return Qt;
5363 if (!SYMBOLP (obj))
5364 return Qnil;
5365 /* Get coding-spec vector for OBJ. */
5366 obj = Fget (obj, Qcoding_system);
5367 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5368 ? Qt : Qnil);
5369 }
5370
5371 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5372 Sread_non_nil_coding_system, 1, 1, 0,
5373 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5374 (prompt)
5375 Lisp_Object prompt;
5376 {
5377 Lisp_Object val;
5378 do
5379 {
5380 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5381 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5382 }
5383 while (XSTRING (val)->size == 0);
5384 return (Fintern (val, Qnil));
5385 }
5386
5387 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5388 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5389 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5390 (prompt, default_coding_system)
5391 Lisp_Object prompt, default_coding_system;
5392 {
5393 Lisp_Object val;
5394 if (SYMBOLP (default_coding_system))
5395 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5396 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5397 Qt, Qnil, Qcoding_system_history,
5398 default_coding_system, Qnil);
5399 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5400 }
5401
5402 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5403 1, 1, 0,
5404 "Check validity of CODING-SYSTEM.\n\
5405 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5406 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5407 The value of property should be a vector of length 5.")
5408 (coding_system)
5409 Lisp_Object coding_system;
5410 {
5411 CHECK_SYMBOL (coding_system, 0);
5412 if (!NILP (Fcoding_system_p (coding_system)))
5413 return coding_system;
5414 while (1)
5415 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5416 }
5417 \f
5418 Lisp_Object
5419 detect_coding_system (src, src_bytes, highest)
5420 unsigned char *src;
5421 int src_bytes, highest;
5422 {
5423 int coding_mask, eol_type;
5424 Lisp_Object val, tmp;
5425 int dummy;
5426
5427 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5428 eol_type = detect_eol_type (src, src_bytes, &dummy);
5429 if (eol_type == CODING_EOL_INCONSISTENT)
5430 eol_type = CODING_EOL_UNDECIDED;
5431
5432 if (!coding_mask)
5433 {
5434 val = Qundecided;
5435 if (eol_type != CODING_EOL_UNDECIDED)
5436 {
5437 Lisp_Object val2;
5438 val2 = Fget (Qundecided, Qeol_type);
5439 if (VECTORP (val2))
5440 val = XVECTOR (val2)->contents[eol_type];
5441 }
5442 return (highest ? val : Fcons (val, Qnil));
5443 }
5444
5445 /* At first, gather possible coding systems in VAL. */
5446 val = Qnil;
5447 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5448 {
5449 Lisp_Object category_val, category_index;
5450
5451 category_index = Fget (XCAR (tmp), Qcoding_category_index);
5452 category_val = Fsymbol_value (XCAR (tmp));
5453 if (!NILP (category_val)
5454 && NATNUMP (category_index)
5455 && (coding_mask & (1 << XFASTINT (category_index))))
5456 {
5457 val = Fcons (category_val, val);
5458 if (highest)
5459 break;
5460 }
5461 }
5462 if (!highest)
5463 val = Fnreverse (val);
5464
5465 /* Then, replace the elements with subsidiary coding systems. */
5466 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5467 {
5468 if (eol_type != CODING_EOL_UNDECIDED
5469 && eol_type != CODING_EOL_INCONSISTENT)
5470 {
5471 Lisp_Object eol;
5472 eol = Fget (XCAR (tmp), Qeol_type);
5473 if (VECTORP (eol))
5474 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5475 }
5476 }
5477 return (highest ? XCAR (val) : val);
5478 }
5479
5480 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5481 2, 3, 0,
5482 "Detect coding system of the text in the region between START and END.\n\
5483 Return a list of possible coding systems ordered by priority.\n\
5484 \n\
5485 If only ASCII characters are found, it returns a list of single element\n\
5486 `undecided' or its subsidiary coding system according to a detected\n\
5487 end-of-line format.\n\
5488 \n\
5489 If optional argument HIGHEST is non-nil, return the coding system of\n\
5490 highest priority.")
5491 (start, end, highest)
5492 Lisp_Object start, end, highest;
5493 {
5494 int from, to;
5495 int from_byte, to_byte;
5496
5497 CHECK_NUMBER_COERCE_MARKER (start, 0);
5498 CHECK_NUMBER_COERCE_MARKER (end, 1);
5499
5500 validate_region (&start, &end);
5501 from = XINT (start), to = XINT (end);
5502 from_byte = CHAR_TO_BYTE (from);
5503 to_byte = CHAR_TO_BYTE (to);
5504
5505 if (from < GPT && to >= GPT)
5506 move_gap_both (to, to_byte);
5507
5508 return detect_coding_system (BYTE_POS_ADDR (from_byte),
5509 to_byte - from_byte,
5510 !NILP (highest));
5511 }
5512
5513 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5514 1, 2, 0,
5515 "Detect coding system of the text in STRING.\n\
5516 Return a list of possible coding systems ordered by priority.\n\
5517 \n\
5518 If only ASCII characters are found, it returns a list of single element\n\
5519 `undecided' or its subsidiary coding system according to a detected\n\
5520 end-of-line format.\n\
5521 \n\
5522 If optional argument HIGHEST is non-nil, return the coding system of\n\
5523 highest priority.")
5524 (string, highest)
5525 Lisp_Object string, highest;
5526 {
5527 CHECK_STRING (string, 0);
5528
5529 return detect_coding_system (XSTRING (string)->data,
5530 STRING_BYTES (XSTRING (string)),
5531 !NILP (highest));
5532 }
5533
5534 Lisp_Object
5535 code_convert_region1 (start, end, coding_system, encodep)
5536 Lisp_Object start, end, coding_system;
5537 int encodep;
5538 {
5539 struct coding_system coding;
5540 int from, to, len;
5541
5542 CHECK_NUMBER_COERCE_MARKER (start, 0);
5543 CHECK_NUMBER_COERCE_MARKER (end, 1);
5544 CHECK_SYMBOL (coding_system, 2);
5545
5546 validate_region (&start, &end);
5547 from = XFASTINT (start);
5548 to = XFASTINT (end);
5549
5550 if (NILP (coding_system))
5551 return make_number (to - from);
5552
5553 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5554 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5555
5556 coding.mode |= CODING_MODE_LAST_BLOCK;
5557 coding.src_multibyte = coding.dst_multibyte
5558 = !NILP (current_buffer->enable_multibyte_characters);
5559 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5560 &coding, encodep, 1);
5561 Vlast_coding_system_used = coding.symbol;
5562 return make_number (coding.produced_char);
5563 }
5564
5565 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5566 3, 3, "r\nzCoding system: ",
5567 "Decode the current region by specified coding system.\n\
5568 When called from a program, takes three arguments:\n\
5569 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5570 This function sets `last-coding-system-used' to the precise coding system\n\
5571 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5572 not fully specified.)\n\
5573 It returns the length of the decoded text.")
5574 (start, end, coding_system)
5575 Lisp_Object start, end, coding_system;
5576 {
5577 return code_convert_region1 (start, end, coding_system, 0);
5578 }
5579
5580 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5581 3, 3, "r\nzCoding system: ",
5582 "Encode the current region by specified coding system.\n\
5583 When called from a program, takes three arguments:\n\
5584 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5585 This function sets `last-coding-system-used' to the precise coding system\n\
5586 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5587 not fully specified.)\n\
5588 It returns the length of the encoded text.")
5589 (start, end, coding_system)
5590 Lisp_Object start, end, coding_system;
5591 {
5592 return code_convert_region1 (start, end, coding_system, 1);
5593 }
5594
5595 Lisp_Object
5596 code_convert_string1 (string, coding_system, nocopy, encodep)
5597 Lisp_Object string, coding_system, nocopy;
5598 int encodep;
5599 {
5600 struct coding_system coding;
5601
5602 CHECK_STRING (string, 0);
5603 CHECK_SYMBOL (coding_system, 1);
5604
5605 if (NILP (coding_system))
5606 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5607
5608 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5609 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5610
5611 coding.mode |= CODING_MODE_LAST_BLOCK;
5612 string = (encodep
5613 ? encode_coding_string (string, &coding, !NILP (nocopy))
5614 : decode_coding_string (string, &coding, !NILP (nocopy)));
5615 Vlast_coding_system_used = coding.symbol;
5616
5617 return string;
5618 }
5619
5620 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5621 2, 3, 0,
5622 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5623 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5624 if the decoding operation is trivial.\n\
5625 This function sets `last-coding-system-used' to the precise coding system\n\
5626 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5627 not fully specified.)")
5628 (string, coding_system, nocopy)
5629 Lisp_Object string, coding_system, nocopy;
5630 {
5631 return code_convert_string1 (string, coding_system, nocopy, 0);
5632 }
5633
5634 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5635 2, 3, 0,
5636 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5637 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5638 if the encoding operation is trivial.\n\
5639 This function sets `last-coding-system-used' to the precise coding system\n\
5640 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5641 not fully specified.)")
5642 (string, coding_system, nocopy)
5643 Lisp_Object string, coding_system, nocopy;
5644 {
5645 return code_convert_string1 (string, coding_system, nocopy, 1);
5646 }
5647
5648 /* Encode or decode STRING according to CODING_SYSTEM.
5649 Do not set Vlast_coding_system_used.
5650
5651 This function is called only from macros DECODE_FILE and
5652 ENCODE_FILE, thus we ignore character composition. */
5653
5654 Lisp_Object
5655 code_convert_string_norecord (string, coding_system, encodep)
5656 Lisp_Object string, coding_system;
5657 int encodep;
5658 {
5659 struct coding_system coding;
5660
5661 CHECK_STRING (string, 0);
5662 CHECK_SYMBOL (coding_system, 1);
5663
5664 if (NILP (coding_system))
5665 return string;
5666
5667 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5668 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5669
5670 coding.composing = COMPOSITION_DISABLED;
5671 coding.mode |= CODING_MODE_LAST_BLOCK;
5672 return (encodep
5673 ? encode_coding_string (string, &coding, 1)
5674 : decode_coding_string (string, &coding, 1));
5675 }
5676 \f
5677 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5678 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5679 Return the corresponding character.")
5680 (code)
5681 Lisp_Object code;
5682 {
5683 unsigned char c1, c2, s1, s2;
5684 Lisp_Object val;
5685
5686 CHECK_NUMBER (code, 0);
5687 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5688 if (s1 == 0)
5689 {
5690 if (s2 < 0x80)
5691 XSETFASTINT (val, s2);
5692 else if (s2 >= 0xA0 || s2 <= 0xDF)
5693 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5694 else
5695 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5696 }
5697 else
5698 {
5699 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5700 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5701 error ("Invalid Shift JIS code: %x", XFASTINT (code));
5702 DECODE_SJIS (s1, s2, c1, c2);
5703 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5704 }
5705 return val;
5706 }
5707
5708 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5709 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5710 Return the corresponding code in SJIS.")
5711 (ch)
5712 Lisp_Object ch;
5713 {
5714 int charset, c1, c2, s1, s2;
5715 Lisp_Object val;
5716
5717 CHECK_NUMBER (ch, 0);
5718 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5719 if (charset == CHARSET_ASCII)
5720 {
5721 val = ch;
5722 }
5723 else if (charset == charset_jisx0208
5724 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5725 {
5726 ENCODE_SJIS (c1, c2, s1, s2);
5727 XSETFASTINT (val, (s1 << 8) | s2);
5728 }
5729 else if (charset == charset_katakana_jisx0201
5730 && c1 > 0x20 && c2 < 0xE0)
5731 {
5732 XSETFASTINT (val, c1 | 0x80);
5733 }
5734 else
5735 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5736 return val;
5737 }
5738
5739 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5740 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5741 Return the corresponding character.")
5742 (code)
5743 Lisp_Object code;
5744 {
5745 int charset;
5746 unsigned char b1, b2, c1, c2;
5747 Lisp_Object val;
5748
5749 CHECK_NUMBER (code, 0);
5750 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5751 if (b1 == 0)
5752 {
5753 if (b2 >= 0x80)
5754 error ("Invalid BIG5 code: %x", XFASTINT (code));
5755 val = code;
5756 }
5757 else
5758 {
5759 if ((b1 < 0xA1 || b1 > 0xFE)
5760 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5761 error ("Invalid BIG5 code: %x", XFASTINT (code));
5762 DECODE_BIG5 (b1, b2, charset, c1, c2);
5763 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5764 }
5765 return val;
5766 }
5767
5768 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5769 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5770 Return the corresponding character code in Big5.")
5771 (ch)
5772 Lisp_Object ch;
5773 {
5774 int charset, c1, c2, b1, b2;
5775 Lisp_Object val;
5776
5777 CHECK_NUMBER (ch, 0);
5778 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5779 if (charset == CHARSET_ASCII)
5780 {
5781 val = ch;
5782 }
5783 else if ((charset == charset_big5_1
5784 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5785 || (charset == charset_big5_2
5786 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5787 {
5788 ENCODE_BIG5 (charset, c1, c2, b1, b2);
5789 XSETFASTINT (val, (b1 << 8) | b2);
5790 }
5791 else
5792 error ("Can't encode to Big5: %d", XFASTINT (ch));
5793 return val;
5794 }
5795 \f
5796 DEFUN ("set-terminal-coding-system-internal",
5797 Fset_terminal_coding_system_internal,
5798 Sset_terminal_coding_system_internal, 1, 1, 0, "")
5799 (coding_system)
5800 Lisp_Object coding_system;
5801 {
5802 CHECK_SYMBOL (coding_system, 0);
5803 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5804 /* We had better not send unsafe characters to terminal. */
5805 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5806 /* Characer composition should be disabled. */
5807 terminal_coding.composing = COMPOSITION_DISABLED;
5808 terminal_coding.src_multibyte = 1;
5809 terminal_coding.dst_multibyte = 0;
5810 return Qnil;
5811 }
5812
5813 DEFUN ("set-safe-terminal-coding-system-internal",
5814 Fset_safe_terminal_coding_system_internal,
5815 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5816 (coding_system)
5817 Lisp_Object coding_system;
5818 {
5819 CHECK_SYMBOL (coding_system, 0);
5820 setup_coding_system (Fcheck_coding_system (coding_system),
5821 &safe_terminal_coding);
5822 /* Characer composition should be disabled. */
5823 safe_terminal_coding.composing = COMPOSITION_DISABLED;
5824 safe_terminal_coding.src_multibyte = 1;
5825 safe_terminal_coding.dst_multibyte = 0;
5826 return Qnil;
5827 }
5828
5829 DEFUN ("terminal-coding-system",
5830 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5831 "Return coding system specified for terminal output.")
5832 ()
5833 {
5834 return terminal_coding.symbol;
5835 }
5836
5837 DEFUN ("set-keyboard-coding-system-internal",
5838 Fset_keyboard_coding_system_internal,
5839 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5840 (coding_system)
5841 Lisp_Object coding_system;
5842 {
5843 CHECK_SYMBOL (coding_system, 0);
5844 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5845 /* Characer composition should be disabled. */
5846 keyboard_coding.composing = COMPOSITION_DISABLED;
5847 return Qnil;
5848 }
5849
5850 DEFUN ("keyboard-coding-system",
5851 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5852 "Return coding system specified for decoding keyboard input.")
5853 ()
5854 {
5855 return keyboard_coding.symbol;
5856 }
5857
5858 \f
5859 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5860 Sfind_operation_coding_system, 1, MANY, 0,
5861 "Choose a coding system for an operation based on the target name.\n\
5862 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5863 DECODING-SYSTEM is the coding system to use for decoding\n\
5864 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5865 for encoding (in case OPERATION does encoding).\n\
5866 \n\
5867 The first argument OPERATION specifies an I/O primitive:\n\
5868 For file I/O, `insert-file-contents' or `write-region'.\n\
5869 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5870 For network I/O, `open-network-stream'.\n\
5871 \n\
5872 The remaining arguments should be the same arguments that were passed\n\
5873 to the primitive. Depending on which primitive, one of those arguments\n\
5874 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5875 whichever argument specifies the file name is TARGET.\n\
5876 \n\
5877 TARGET has a meaning which depends on OPERATION:\n\
5878 For file I/O, TARGET is a file name.\n\
5879 For process I/O, TARGET is a process name.\n\
5880 For network I/O, TARGET is a service name or a port number\n\
5881 \n\
5882 This function looks up what specified for TARGET in,\n\
5883 `file-coding-system-alist', `process-coding-system-alist',\n\
5884 or `network-coding-system-alist' depending on OPERATION.\n\
5885 They may specify a coding system, a cons of coding systems,\n\
5886 or a function symbol to call.\n\
5887 In the last case, we call the function with one argument,\n\
5888 which is a list of all the arguments given to this function.")
5889 (nargs, args)
5890 int nargs;
5891 Lisp_Object *args;
5892 {
5893 Lisp_Object operation, target_idx, target, val;
5894 register Lisp_Object chain;
5895
5896 if (nargs < 2)
5897 error ("Too few arguments");
5898 operation = args[0];
5899 if (!SYMBOLP (operation)
5900 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5901 error ("Invalid first arguement");
5902 if (nargs < 1 + XINT (target_idx))
5903 error ("Too few arguments for operation: %s",
5904 XSYMBOL (operation)->name->data);
5905 target = args[XINT (target_idx) + 1];
5906 if (!(STRINGP (target)
5907 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5908 error ("Invalid %dth argument", XINT (target_idx) + 1);
5909
5910 chain = ((EQ (operation, Qinsert_file_contents)
5911 || EQ (operation, Qwrite_region))
5912 ? Vfile_coding_system_alist
5913 : (EQ (operation, Qopen_network_stream)
5914 ? Vnetwork_coding_system_alist
5915 : Vprocess_coding_system_alist));
5916 if (NILP (chain))
5917 return Qnil;
5918
5919 for (; CONSP (chain); chain = XCDR (chain))
5920 {
5921 Lisp_Object elt;
5922 elt = XCAR (chain);
5923
5924 if (CONSP (elt)
5925 && ((STRINGP (target)
5926 && STRINGP (XCAR (elt))
5927 && fast_string_match (XCAR (elt), target) >= 0)
5928 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5929 {
5930 val = XCDR (elt);
5931 /* Here, if VAL is both a valid coding system and a valid
5932 function symbol, we return VAL as a coding system. */
5933 if (CONSP (val))
5934 return val;
5935 if (! SYMBOLP (val))
5936 return Qnil;
5937 if (! NILP (Fcoding_system_p (val)))
5938 return Fcons (val, val);
5939 if (! NILP (Ffboundp (val)))
5940 {
5941 val = call1 (val, Flist (nargs, args));
5942 if (CONSP (val))
5943 return val;
5944 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5945 return Fcons (val, val);
5946 }
5947 return Qnil;
5948 }
5949 }
5950 return Qnil;
5951 }
5952
5953 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5954 Supdate_coding_systems_internal, 0, 0, 0,
5955 "Update internal database for ISO2022 and CCL based coding systems.\n\
5956 When values of any coding categories are changed, you must\n\
5957 call this function")
5958 ()
5959 {
5960 int i;
5961
5962 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5963 {
5964 Lisp_Object val;
5965
5966 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5967 if (!NILP (val))
5968 {
5969 if (! coding_system_table[i])
5970 coding_system_table[i] = ((struct coding_system *)
5971 xmalloc (sizeof (struct coding_system)));
5972 setup_coding_system (val, coding_system_table[i]);
5973 }
5974 else if (coding_system_table[i])
5975 {
5976 xfree (coding_system_table[i]);
5977 coding_system_table[i] = NULL;
5978 }
5979 }
5980
5981 return Qnil;
5982 }
5983
5984 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5985 Sset_coding_priority_internal, 0, 0, 0,
5986 "Update internal database for the current value of `coding-category-list'.\n\
5987 This function is internal use only.")
5988 ()
5989 {
5990 int i = 0, idx;
5991 Lisp_Object val;
5992
5993 val = Vcoding_category_list;
5994
5995 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5996 {
5997 if (! SYMBOLP (XCAR (val)))
5998 break;
5999 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6000 if (idx >= CODING_CATEGORY_IDX_MAX)
6001 break;
6002 coding_priorities[i++] = (1 << idx);
6003 val = XCDR (val);
6004 }
6005 /* If coding-category-list is valid and contains all coding
6006 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
6007 the following code saves Emacs from crashing. */
6008 while (i < CODING_CATEGORY_IDX_MAX)
6009 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6010
6011 return Qnil;
6012 }
6013
6014 #endif /* emacs */
6015
6016 \f
6017 /*** 9. Post-amble ***/
6018
6019 void
6020 init_coding ()
6021 {
6022 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6023 }
6024
6025 void
6026 init_coding_once ()
6027 {
6028 int i;
6029
6030 /* Emacs' internal format specific initialize routine. */
6031 for (i = 0; i <= 0x20; i++)
6032 emacs_code_class[i] = EMACS_control_code;
6033 emacs_code_class[0x0A] = EMACS_linefeed_code;
6034 emacs_code_class[0x0D] = EMACS_carriage_return_code;
6035 for (i = 0x21 ; i < 0x7F; i++)
6036 emacs_code_class[i] = EMACS_ascii_code;
6037 emacs_code_class[0x7F] = EMACS_control_code;
6038 for (i = 0x80; i < 0xFF; i++)
6039 emacs_code_class[i] = EMACS_invalid_code;
6040 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6041 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6042 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6043 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6044
6045 /* ISO2022 specific initialize routine. */
6046 for (i = 0; i < 0x20; i++)
6047 iso_code_class[i] = ISO_control_0;
6048 for (i = 0x21; i < 0x7F; i++)
6049 iso_code_class[i] = ISO_graphic_plane_0;
6050 for (i = 0x80; i < 0xA0; i++)
6051 iso_code_class[i] = ISO_control_1;
6052 for (i = 0xA1; i < 0xFF; i++)
6053 iso_code_class[i] = ISO_graphic_plane_1;
6054 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6055 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6056 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6057 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6058 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6059 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6060 iso_code_class[ISO_CODE_ESC] = ISO_escape;
6061 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6062 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6063 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6064
6065 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
6066
6067 setup_coding_system (Qnil, &keyboard_coding);
6068 setup_coding_system (Qnil, &terminal_coding);
6069 setup_coding_system (Qnil, &safe_terminal_coding);
6070 setup_coding_system (Qnil, &default_buffer_file_coding);
6071
6072 bzero (coding_system_table, sizeof coding_system_table);
6073
6074 bzero (ascii_skip_code, sizeof ascii_skip_code);
6075 for (i = 0; i < 128; i++)
6076 ascii_skip_code[i] = 1;
6077
6078 #if defined (MSDOS) || defined (WINDOWSNT)
6079 system_eol_type = CODING_EOL_CRLF;
6080 #else
6081 system_eol_type = CODING_EOL_LF;
6082 #endif
6083
6084 inhibit_pre_post_conversion = 0;
6085 }
6086
6087 #ifdef emacs
6088
6089 void
6090 syms_of_coding ()
6091 {
6092 Qtarget_idx = intern ("target-idx");
6093 staticpro (&Qtarget_idx);
6094
6095 Qcoding_system_history = intern ("coding-system-history");
6096 staticpro (&Qcoding_system_history);
6097 Fset (Qcoding_system_history, Qnil);
6098
6099 /* Target FILENAME is the first argument. */
6100 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6101 /* Target FILENAME is the third argument. */
6102 Fput (Qwrite_region, Qtarget_idx, make_number (2));
6103
6104 Qcall_process = intern ("call-process");
6105 staticpro (&Qcall_process);
6106 /* Target PROGRAM is the first argument. */
6107 Fput (Qcall_process, Qtarget_idx, make_number (0));
6108
6109 Qcall_process_region = intern ("call-process-region");
6110 staticpro (&Qcall_process_region);
6111 /* Target PROGRAM is the third argument. */
6112 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6113
6114 Qstart_process = intern ("start-process");
6115 staticpro (&Qstart_process);
6116 /* Target PROGRAM is the third argument. */
6117 Fput (Qstart_process, Qtarget_idx, make_number (2));
6118
6119 Qopen_network_stream = intern ("open-network-stream");
6120 staticpro (&Qopen_network_stream);
6121 /* Target SERVICE is the fourth argument. */
6122 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6123
6124 Qcoding_system = intern ("coding-system");
6125 staticpro (&Qcoding_system);
6126
6127 Qeol_type = intern ("eol-type");
6128 staticpro (&Qeol_type);
6129
6130 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6131 staticpro (&Qbuffer_file_coding_system);
6132
6133 Qpost_read_conversion = intern ("post-read-conversion");
6134 staticpro (&Qpost_read_conversion);
6135
6136 Qpre_write_conversion = intern ("pre-write-conversion");
6137 staticpro (&Qpre_write_conversion);
6138
6139 Qno_conversion = intern ("no-conversion");
6140 staticpro (&Qno_conversion);
6141
6142 Qundecided = intern ("undecided");
6143 staticpro (&Qundecided);
6144
6145 Qcoding_system_p = intern ("coding-system-p");
6146 staticpro (&Qcoding_system_p);
6147
6148 Qcoding_system_error = intern ("coding-system-error");
6149 staticpro (&Qcoding_system_error);
6150
6151 Fput (Qcoding_system_error, Qerror_conditions,
6152 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6153 Fput (Qcoding_system_error, Qerror_message,
6154 build_string ("Invalid coding system"));
6155
6156 Qcoding_category = intern ("coding-category");
6157 staticpro (&Qcoding_category);
6158 Qcoding_category_index = intern ("coding-category-index");
6159 staticpro (&Qcoding_category_index);
6160
6161 Vcoding_category_table
6162 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6163 staticpro (&Vcoding_category_table);
6164 {
6165 int i;
6166 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6167 {
6168 XVECTOR (Vcoding_category_table)->contents[i]
6169 = intern (coding_category_name[i]);
6170 Fput (XVECTOR (Vcoding_category_table)->contents[i],
6171 Qcoding_category_index, make_number (i));
6172 }
6173 }
6174
6175 Qtranslation_table = intern ("translation-table");
6176 staticpro (&Qtranslation_table);
6177 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6178
6179 Qtranslation_table_id = intern ("translation-table-id");
6180 staticpro (&Qtranslation_table_id);
6181
6182 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6183 staticpro (&Qtranslation_table_for_decode);
6184
6185 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6186 staticpro (&Qtranslation_table_for_encode);
6187
6188 Qsafe_charsets = intern ("safe-charsets");
6189 staticpro (&Qsafe_charsets);
6190
6191 Qvalid_codes = intern ("valid-codes");
6192 staticpro (&Qvalid_codes);
6193
6194 Qemacs_mule = intern ("emacs-mule");
6195 staticpro (&Qemacs_mule);
6196
6197 Qraw_text = intern ("raw-text");
6198 staticpro (&Qraw_text);
6199
6200 defsubr (&Scoding_system_p);
6201 defsubr (&Sread_coding_system);
6202 defsubr (&Sread_non_nil_coding_system);
6203 defsubr (&Scheck_coding_system);
6204 defsubr (&Sdetect_coding_region);
6205 defsubr (&Sdetect_coding_string);
6206 defsubr (&Sdecode_coding_region);
6207 defsubr (&Sencode_coding_region);
6208 defsubr (&Sdecode_coding_string);
6209 defsubr (&Sencode_coding_string);
6210 defsubr (&Sdecode_sjis_char);
6211 defsubr (&Sencode_sjis_char);
6212 defsubr (&Sdecode_big5_char);
6213 defsubr (&Sencode_big5_char);
6214 defsubr (&Sset_terminal_coding_system_internal);
6215 defsubr (&Sset_safe_terminal_coding_system_internal);
6216 defsubr (&Sterminal_coding_system);
6217 defsubr (&Sset_keyboard_coding_system_internal);
6218 defsubr (&Skeyboard_coding_system);
6219 defsubr (&Sfind_operation_coding_system);
6220 defsubr (&Supdate_coding_systems_internal);
6221 defsubr (&Sset_coding_priority_internal);
6222
6223 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6224 "List of coding systems.\n\
6225 \n\
6226 Do not alter the value of this variable manually. This variable should be\n\
6227 updated by the functions `make-coding-system' and\n\
6228 `define-coding-system-alias'.");
6229 Vcoding_system_list = Qnil;
6230
6231 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6232 "Alist of coding system names.\n\
6233 Each element is one element list of coding system name.\n\
6234 This variable is given to `completing-read' as TABLE argument.\n\
6235 \n\
6236 Do not alter the value of this variable manually. This variable should be\n\
6237 updated by the functions `make-coding-system' and\n\
6238 `define-coding-system-alias'.");
6239 Vcoding_system_alist = Qnil;
6240
6241 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6242 "List of coding-categories (symbols) ordered by priority.");
6243 {
6244 int i;
6245
6246 Vcoding_category_list = Qnil;
6247 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6248 Vcoding_category_list
6249 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6250 Vcoding_category_list);
6251 }
6252
6253 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6254 "Specify the coding system for read operations.\n\
6255 It is useful to bind this variable with `let', but do not set it globally.\n\
6256 If the value is a coding system, it is used for decoding on read operation.\n\
6257 If not, an appropriate element is used from one of the coding system alists:\n\
6258 There are three such tables, `file-coding-system-alist',\n\
6259 `process-coding-system-alist', and `network-coding-system-alist'.");
6260 Vcoding_system_for_read = Qnil;
6261
6262 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6263 "Specify the coding system for write operations.\n\
6264 Programs bind this variable with `let', but you should not set it globally.\n\
6265 If the value is a coding system, it is used for encoding of output,\n\
6266 when writing it to a file and when sending it to a file or subprocess.\n\
6267 \n\
6268 If this does not specify a coding system, an appropriate element\n\
6269 is used from one of the coding system alists:\n\
6270 There are three such tables, `file-coding-system-alist',\n\
6271 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6272 For output to files, if the above procedure does not specify a coding system,\n\
6273 the value of `buffer-file-coding-system' is used.");
6274 Vcoding_system_for_write = Qnil;
6275
6276 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6277 "Coding system used in the latest file or process I/O.");
6278 Vlast_coding_system_used = Qnil;
6279
6280 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6281 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6282 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6283 such conversion.");
6284 inhibit_eol_conversion = 0;
6285
6286 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6287 "Non-nil means process buffer inherits coding system of process output.\n\
6288 Bind it to t if the process output is to be treated as if it were a file\n\
6289 read from some filesystem.");
6290 inherit_process_coding_system = 0;
6291
6292 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6293 "Alist to decide a coding system to use for a file I/O operation.\n\
6294 The format is ((PATTERN . VAL) ...),\n\
6295 where PATTERN is a regular expression matching a file name,\n\
6296 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6297 If VAL is a coding system, it is used for both decoding and encoding\n\
6298 the file contents.\n\
6299 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6300 and the cdr part is used for encoding.\n\
6301 If VAL is a function symbol, the function must return a coding system\n\
6302 or a cons of coding systems which are used as above.\n\
6303 \n\
6304 See also the function `find-operation-coding-system'\n\
6305 and the variable `auto-coding-alist'.");
6306 Vfile_coding_system_alist = Qnil;
6307
6308 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6309 "Alist to decide a coding system to use for a process I/O operation.\n\
6310 The format is ((PATTERN . VAL) ...),\n\
6311 where PATTERN is a regular expression matching a program name,\n\
6312 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6313 If VAL is a coding system, it is used for both decoding what received\n\
6314 from the program and encoding what sent to the program.\n\
6315 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6316 and the cdr part is used for encoding.\n\
6317 If VAL is a function symbol, the function must return a coding system\n\
6318 or a cons of coding systems which are used as above.\n\
6319 \n\
6320 See also the function `find-operation-coding-system'.");
6321 Vprocess_coding_system_alist = Qnil;
6322
6323 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6324 "Alist to decide a coding system to use for a network I/O operation.\n\
6325 The format is ((PATTERN . VAL) ...),\n\
6326 where PATTERN is a regular expression matching a network service name\n\
6327 or is a port number to connect to,\n\
6328 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6329 If VAL is a coding system, it is used for both decoding what received\n\
6330 from the network stream and encoding what sent to the network stream.\n\
6331 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6332 and the cdr part is used for encoding.\n\
6333 If VAL is a function symbol, the function must return a coding system\n\
6334 or a cons of coding systems which are used as above.\n\
6335 \n\
6336 See also the function `find-operation-coding-system'.");
6337 Vnetwork_coding_system_alist = Qnil;
6338
6339 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6340 "Coding system to use with system messages.");
6341 Vlocale_coding_system = Qnil;
6342
6343 /* The eol mnemonics are reset in startup.el system-dependently. */
6344 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6345 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6346 eol_mnemonic_unix = build_string (":");
6347
6348 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6349 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6350 eol_mnemonic_dos = build_string ("\\");
6351
6352 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6353 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6354 eol_mnemonic_mac = build_string ("/");
6355
6356 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6357 "*String displayed in mode line when end-of-line format is not yet determined.");
6358 eol_mnemonic_undecided = build_string (":");
6359
6360 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6361 "*Non-nil enables character translation while encoding and decoding.");
6362 Venable_character_translation = Qt;
6363
6364 DEFVAR_LISP ("standard-translation-table-for-decode",
6365 &Vstandard_translation_table_for_decode,
6366 "Table for translating characters while decoding.");
6367 Vstandard_translation_table_for_decode = Qnil;
6368
6369 DEFVAR_LISP ("standard-translation-table-for-encode",
6370 &Vstandard_translation_table_for_encode,
6371 "Table for translationg characters while encoding.");
6372 Vstandard_translation_table_for_encode = Qnil;
6373
6374 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6375 "Alist of charsets vs revision numbers.\n\
6376 While encoding, if a charset (car part of an element) is found,\n\
6377 designate it with the escape sequence identifing revision (cdr part of the element).");
6378 Vcharset_revision_alist = Qnil;
6379
6380 DEFVAR_LISP ("default-process-coding-system",
6381 &Vdefault_process_coding_system,
6382 "Cons of coding systems used for process I/O by default.\n\
6383 The car part is used for decoding a process output,\n\
6384 the cdr part is used for encoding a text to be sent to a process.");
6385 Vdefault_process_coding_system = Qnil;
6386
6387 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6388 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6389 This is a vector of length 256.\n\
6390 If Nth element is non-nil, the existence of code N in a file\n\
6391 \(or output of subprocess) doesn't prevent it to be detected as\n\
6392 a coding system of ISO 2022 variant which has a flag\n\
6393 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6394 or reading output of a subprocess.\n\
6395 Only 128th through 159th elements has a meaning.");
6396 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6397
6398 DEFVAR_LISP ("select-safe-coding-system-function",
6399 &Vselect_safe_coding_system_function,
6400 "Function to call to select safe coding system for encoding a text.\n\
6401 \n\
6402 If set, this function is called to force a user to select a proper\n\
6403 coding system which can encode the text in the case that a default\n\
6404 coding system used in each operation can't encode the text.\n\
6405 \n\
6406 The default value is `select-safe-coding-system' (which see).");
6407 Vselect_safe_coding_system_function = Qnil;
6408
6409 }
6410
6411 char *
6412 emacs_strerror (error_number)
6413 int error_number;
6414 {
6415 char *str;
6416
6417 synchronize_system_messages_locale ();
6418 str = strerror (error_number);
6419
6420 if (! NILP (Vlocale_coding_system))
6421 {
6422 Lisp_Object dec = code_convert_string_norecord (build_string (str),
6423 Vlocale_coding_system,
6424 0);
6425 str = (char *) XSTRING (dec)->data;
6426 }
6427
6428 return str;
6429 }
6430
6431 #endif /* emacs */
6432