]> code.delx.au - gnu-emacs/blob - src/coding.c
(Fcheck_coding_system): Use xsignal1. Remove loop.
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7
8 This file is part of GNU Emacs.
9
10 GNU Emacs is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 GNU Emacs is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with GNU Emacs; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 Boston, MA 02110-1301, USA. */
24
25 /*** TABLE OF CONTENTS ***
26
27 0. General comments
28 1. Preamble
29 2. Emacs' internal format (emacs-mule) handlers
30 3. ISO2022 handlers
31 4. Shift-JIS and BIG5 handlers
32 5. CCL handlers
33 6. End-of-line handlers
34 7. C library functions
35 8. Emacs Lisp library functions
36 9. Post-amble
37
38 */
39
40 /*** 0. General comments ***/
41
42
43 /*** GENERAL NOTE on CODING SYSTEMS ***
44
45 A coding system is an encoding mechanism for one or more character
46 sets. Here's a list of coding systems which Emacs can handle. When
47 we say "decode", it means converting some other coding system to
48 Emacs' internal format (emacs-mule), and when we say "encode",
49 it means converting the coding system emacs-mule to some other
50 coding system.
51
52 0. Emacs' internal format (emacs-mule)
53
54 Emacs itself holds a multi-lingual character in buffers and strings
55 in a special format. Details are described in section 2.
56
57 1. ISO2022
58
59 The most famous coding system for multiple character sets. X's
60 Compound Text, various EUCs (Extended Unix Code), and coding
61 systems used in Internet communication such as ISO-2022-JP are
62 all variants of ISO2022. Details are described in section 3.
63
64 2. SJIS (or Shift-JIS or MS-Kanji-Code)
65
66 A coding system to encode character sets: ASCII, JISX0201, and
67 JISX0208. Widely used for PC's in Japan. Details are described in
68 section 4.
69
70 3. BIG5
71
72 A coding system to encode the character sets ASCII and Big5. Widely
73 used for Chinese (mainly in Taiwan and Hong Kong). Details are
74 described in section 4. In this file, when we write "BIG5"
75 (all uppercase), we mean the coding system, and when we write
76 "Big5" (capitalized), we mean the character set.
77
78 4. Raw text
79
80 A coding system for text containing random 8-bit code. Emacs does
81 no code conversion on such text except for end-of-line format.
82
83 5. Other
84
85 If a user wants to read/write text encoded in a coding system not
86 listed above, he can supply a decoder and an encoder for it as CCL
87 (Code Conversion Language) programs. Emacs executes the CCL program
88 while reading/writing.
89
90 Emacs represents a coding system by a Lisp symbol that has a property
91 `coding-system'. But, before actually using the coding system, the
92 information about it is set in a structure of type `struct
93 coding_system' for rapid processing. See section 6 for more details.
94
95 */
96
97 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
98
99 How end-of-line of text is encoded depends on the operating system.
100 For instance, Unix's format is just one byte of `line-feed' code,
101 whereas DOS's format is two-byte sequence of `carriage-return' and
102 `line-feed' codes. MacOS's format is usually one byte of
103 `carriage-return'.
104
105 Since text character encoding and end-of-line encoding are
106 independent, any coding system described above can have any
107 end-of-line format. So Emacs has information about end-of-line
108 format in each coding-system. See section 6 for more details.
109
110 */
111
112 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
113
114 These functions check if a text between SRC and SRC_END is encoded
115 in the coding system category XXX. Each returns an integer value in
116 which appropriate flag bits for the category XXX are set. The flag
117 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
118 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
119 of the range 0x80..0x9F are in multibyte form. */
120 #if 0
121 int
122 detect_coding_emacs_mule (src, src_end, multibytep)
123 unsigned char *src, *src_end;
124 int multibytep;
125 {
126 ...
127 }
128 #endif
129
130 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
131
132 These functions decode SRC_BYTES length of unibyte text at SOURCE
133 encoded in CODING to Emacs' internal format. The resulting
134 multibyte text goes to a place pointed to by DESTINATION, the length
135 of which should not exceed DST_BYTES.
136
137 These functions set the information about original and decoded texts
138 in the members `produced', `produced_char', `consumed', and
139 `consumed_char' of the structure *CODING. They also set the member
140 `result' to one of CODING_FINISH_XXX indicating how the decoding
141 finished.
142
143 DST_BYTES zero means that the source area and destination area are
144 overlapped, which means that we can produce a decoded text until it
145 reaches the head of the not-yet-decoded source text.
146
147 Below is a template for these functions. */
148 #if 0
149 static void
150 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
151 struct coding_system *coding;
152 const unsigned char *source;
153 unsigned char *destination;
154 int src_bytes, dst_bytes;
155 {
156 ...
157 }
158 #endif
159
160 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
161
162 These functions encode SRC_BYTES length text at SOURCE from Emacs'
163 internal multibyte format to CODING. The resulting unibyte text
164 goes to a place pointed to by DESTINATION, the length of which
165 should not exceed DST_BYTES.
166
167 These functions set the information about original and encoded texts
168 in the members `produced', `produced_char', `consumed', and
169 `consumed_char' of the structure *CODING. They also set the member
170 `result' to one of CODING_FINISH_XXX indicating how the encoding
171 finished.
172
173 DST_BYTES zero means that the source area and destination area are
174 overlapped, which means that we can produce encoded text until it
175 reaches the head of the not-yet-encoded source text.
176
177 Below is a template for these functions. */
178 #if 0
179 static void
180 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
181 struct coding_system *coding;
182 unsigned char *source, *destination;
183 int src_bytes, dst_bytes;
184 {
185 ...
186 }
187 #endif
188
189 /*** COMMONLY USED MACROS ***/
190
191 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
192 get one and two bytes from the source text respectively.
193 If there are not enough bytes in the source, they jump to
194 `label_end_of_loop'. The caller should set variables `coding',
195 `src' and `src_end' to appropriate pointer in advance. These
196 macros are called from decoding routines `decode_coding_XXX', thus
197 it is assumed that the source text is unibyte. */
198
/* Store the next source byte in C1 and advance `src'.  If `src' has
   already reached `src_end', set coding->result to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
   without touching C1.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
208
/* Store the next two source bytes in C1 and C2 and advance `src' by
   two.  If fewer than two bytes remain before `src_end', set
   coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
   `label_end_of_loop' without consuming anything.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src + 1 < src_end)                                      \
      {                                                         \
        c1 = *src++;                                            \
        c2 = *src++;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
219
220
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero.  In that case a byte equal to
   LEADING_CODE_8_BIT_CONTROL is treated as the first half of a
   two-byte sequence: the following byte is fetched as well and C1 is
   set to that byte minus 0x20.

   Both fetches are bounds-checked.  If the source ends before either
   of them, coding->result is set to CODING_FINISH_INSUFFICIENT_SRC and
   control jumps to `label_end_of_loop', so the macro never reads at or
   beyond `src_end'.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
      {                                                         \
        /* The trailing byte must exist too.  */                \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            goto label_end_of_loop;                             \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
235
/* Fetch the next character of the source text at `src' into C and
   advance `src' past it.  If no bytes remain before `src_end', set
   coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
   `label_end_of_loop'.

   The caller must have set up `coding', `src', `src_end' and
   `translation_table'.  This macro is for the encoding routines
   `encode_coding_XXX': the source is taken to be in multibyte form,
   except that when coding->src_multibyte is zero a byte that does not
   start a valid multibyte sequence is taken as a character by itself.
   If `translation_table' is non-nil, C is translated through it.  */

#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))      \
      {                                                         \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    if (! NILP (translation_table))                             \
      c = translate_char (translation_table, c, -1, 0, 0);      \
    src += bytes;                                               \
  } while (0)
264
265
/* Write the multibyte form of character C at `dst' and advance `dst'.
   If `dst' lacks room, set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
   The limit of the destination is `dst_end' when `dst_bytes' is
   nonzero, and `src' otherwise.

   While a composition is being decoded, C may instead (or also) be
   recorded as a component in coding->cmp_data->data: for relative and
   with-rule compositions the character is still written to `dst',
   and for every composition method other than the relative one it is
   added as a component.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst_bytes ? dst_end : src) < (dst + bytes))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        /* Evaluated after the component is added, which may itself     \
           change coding->composing.  */                                \
        coding->composition_rule_follows                                \
          = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
      }                                                                 \
  } while (0)
300
301
/* Store byte C at `dst' and advance `dst'.  If `dst' has reached its
   limit (`dst_end' when `dst_bytes' is nonzero, else `src'), set
   coding->result to CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
311
/* Store bytes C1 and C2 at `dst' and advance `dst' by two.  If fewer
   than two bytes of room remain before the limit (`dst_end' when
   `dst_bytes' is nonzero, else `src'), set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'
   without writing anything.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 <= (dst_bytes ? dst_end : src))                 \
      {                                                         \
        *dst++ = c1;                                            \
        *dst++ = c2;                                            \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
321
/* Copy the bytes in the range FROM..TO (TO exclusive) to `dst',
   advancing both `dst' and FROM; FROM ends up equal to TO.  If the
   range does not fit before the limit (`dst_end' when `dst_bytes' is
   nonzero, else `src'), set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'
   without copying anything.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    for (; from < to; from++)                                   \
      *dst++ = *from;                                           \
  } while (0)
332
333 \f
334 /*** 1. Preamble ***/
335
336 #ifdef emacs
337 #include <config.h>
338 #endif
339
340 #include <stdio.h>
341
342 #ifdef emacs
343
344 #include "lisp.h"
345 #include "buffer.h"
346 #include "charset.h"
347 #include "composite.h"
348 #include "ccl.h"
349 #include "coding.h"
350 #include "window.h"
351 #include "intervals.h"
352
353 #else /* not emacs */
354
355 #include "mulelib.h"
356
357 #endif /* not emacs */
358
359 Lisp_Object Qcoding_system, Qeol_type;
360 Lisp_Object Qbuffer_file_coding_system;
361 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
362 Lisp_Object Qno_conversion, Qundecided;
363 Lisp_Object Qcoding_system_history;
364 Lisp_Object Qsafe_chars;
365 Lisp_Object Qvalid_codes;
366 Lisp_Object Qascii_incompatible;
367
368 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
369 Lisp_Object Qcall_process, Qcall_process_region;
370 Lisp_Object Qstart_process, Qopen_network_stream;
371 Lisp_Object Qtarget_idx;
372
373 /* If a symbol has this property, evaluate the value to define the
374 symbol as a coding system. */
375 Lisp_Object Qcoding_system_define_form;
376
377 Lisp_Object Vselect_safe_coding_system_function;
378
379 int coding_system_require_warning;
380
381 /* Mnemonic string for each format of end-of-line. */
382 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
383 /* Mnemonic string to indicate format of end-of-line is not yet
384 decided. */
385 Lisp_Object eol_mnemonic_undecided;
386
387 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
388 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
389 This has an effect only for external encoding (i.e. for output to
390 file and process), not for in-buffer or Lisp string encoding. */
391 int system_eol_type;
392
#ifdef emacs

/* Which coding system is safe for which characters.  The value has
   the form (GENERIC-LIST . NON-GENERIC-ALIST).

   GENERIC-LIST is a list of generic coding systems, i.e. ones that
   can encode any character.

   NON-GENERIC-ALIST is an alist mapping each non-generic coding
   system to the char table that holds its safe characters.  */
Lisp_Object Vcoding_system_safe_chars;

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* The coding systems emacs-mule and raw-text convert only the
   end-of-line format.  */
Lisp_Object Qemacs_mule;
Lisp_Object Qraw_text;

Lisp_Object Qutf_8;

/* Coding systems are handed between Emacs Lisp programs and the C
   internal routines through the following three variables.  */
/* Coding system for reading files and receiving data from a
   process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding system for writing files and sending data to a process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 holding information about special Latin
   codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to inhibit ISO2022 escape sequence detection.  */
int inhibit_iso_escape_detection;

/* Flag to make buffer-file-coding-system inherit from
   process-coding.  */
int inherit_process_coding_system;

/* Coding system used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system used to encode text for terminal display when the
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what is sent from the terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system used to write a file.  */
struct coding_system default_buffer_file_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
457
458 Lisp_Object Qcoding_category, Qcoding_category_index;
459
460 /* List of symbols `coding-category-xxx' ordered by priority. */
461 Lisp_Object Vcoding_category_list;
462
463 /* Table of coding categories (Lisp symbols). */
464 Lisp_Object Vcoding_category_table;
465
466 /* Table of names of symbol for each coding-category. */
467 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
468 "coding-category-emacs-mule",
469 "coding-category-sjis",
470 "coding-category-iso-7",
471 "coding-category-iso-7-tight",
472 "coding-category-iso-8-1",
473 "coding-category-iso-8-2",
474 "coding-category-iso-7-else",
475 "coding-category-iso-8-else",
476 "coding-category-ccl",
477 "coding-category-big5",
478 "coding-category-utf-8",
479 "coding-category-utf-16-be",
480 "coding-category-utf-16-le",
481 "coding-category-raw-text",
482 "coding-category-binary"
483 };
484
485 /* Table of pointers to coding systems corresponding to each coding
486 categories. */
487 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
488
489 /* Table of coding category masks. Nth element is a mask for a coding
490 category of which priority is Nth. */
491 static
492 int coding_priorities[CODING_CATEGORY_IDX_MAX];
493
494 /* Flag to tell if we look up translation table on character code
495 conversion. */
496 Lisp_Object Venable_character_translation;
497 /* Standard translation table to look up on decoding (reading). */
498 Lisp_Object Vstandard_translation_table_for_decode;
499 /* Standard translation table to look up on encoding (writing). */
500 Lisp_Object Vstandard_translation_table_for_encode;
501
502 Lisp_Object Qtranslation_table;
503 Lisp_Object Qtranslation_table_id;
504 Lisp_Object Qtranslation_table_for_decode;
505 Lisp_Object Qtranslation_table_for_encode;
506
507 /* Alist of charsets vs revision number. */
508 Lisp_Object Vcharset_revision_alist;
509
510 /* Default coding systems used for process I/O. */
511 Lisp_Object Vdefault_process_coding_system;
512
513 /* Char table for translating Quail and self-inserting input. */
514 Lisp_Object Vtranslation_table_for_input;
515
516 /* Global flag to tell that we can't call post-read-conversion and
517 pre-write-conversion functions. Usually the value is zero, but it
518 is set to 1 temporarily while such functions are running. This is
519 to avoid infinite recursive call. */
520 static int inhibit_pre_post_conversion;
521
522 Lisp_Object Qchar_coding_system;
523
524 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
525 its validity. */
526
527 Lisp_Object
528 coding_safe_chars (coding_system)
529 Lisp_Object coding_system;
530 {
531 Lisp_Object coding_spec, plist, safe_chars;
532
533 coding_spec = Fget (coding_system, Qcoding_system);
534 plist = XVECTOR (coding_spec)->contents[3];
535 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
536 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
537 }
538
/* Nonzero if character C is safe according to SAFE_CHARS, which is
   either t (every character is safe) or a char table whose entry for
   C is non-nil when C is safe.  The table is consulted only when
   SAFE_CHARS is not t.  */
#define CODING_SAFE_CHAR_P(safe_chars, c)                       \
  (EQ (safe_chars, Qt)                                          \
   ? 1                                                          \
   : !NILP (CHAR_TABLE_REF (safe_chars, c)))
541
542 \f
543 /*** 2. Emacs internal format (emacs-mule) handlers ***/
544
545 /* Emacs' internal format for representation of multiple character
546 sets is a kind of multi-byte encoding, i.e. characters are
547 represented by variable-length sequences of one-byte codes.
548
549 ASCII characters and control characters (e.g. `tab', `newline') are
550 represented by one-byte sequences which are their ASCII codes, in
551 the range 0x00 through 0x7F.
552
553 8-bit characters of the range 0x80..0x9F are represented by
554 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
555 code + 0x20).
556
557 8-bit characters of the range 0xA0..0xFF are represented by
558 one-byte sequences which are their 8-bit code.
559
560 The other characters are represented by a sequence of `base
561 leading-code', optional `extended leading-code', and one or two
562 `position-code's. The length of the sequence is determined by the
563 base leading-code. Leading-code takes the range 0x81 through 0x9D,
564 whereas extended leading-code and position-code take the range 0xA0
565 through 0xFF. See `charset.h' for more details about leading-code
566 and position-code.
567
568 --- CODE RANGE of Emacs' internal format ---
569 character set range
570 ------------- -----
571 ascii 0x00..0x7F
572 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
573 eight-bit-graphic 0xA0..0xFF
574 ELSE 0x81..0x9D + [0xA0..0xFF]+
575 ---------------------------------------------
576
577 As this is the internal character representation, the format is
578 usually not used externally (i.e. in a file or in a data sent to a
579 process). But, it is possible to have a text externally in this
580 format (i.e. by encoding by the coding system `emacs-mule').
581
582 In that case, a sequence of one-byte codes has a slightly different
583 form.
584
585 Firstly, all characters in eight-bit-control are represented by
586 one-byte sequences which are their 8-bit code.
587
588 Next, character composition data are represented by the byte
589 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
590 where,
591 METHOD is 0xF0 plus one of composition method (enum
592 composition_method),
593
594 BYTES is 0xA0 plus the byte length of these composition data,
595
596 CHARS is 0xA0 plus the number of characters composed by these
597 data,
598
599 COMPONENTs are characters of multibyte form or composition
600 rules encoded by two-byte of ASCII codes.
601
602 In addition, for backward compatibility, the following formats are
603 also recognized as composition data on decoding.
604
605 0x80 MSEQ ...
606 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
607
608 Here,
609 MSEQ is a multibyte form but in these special format:
610 ASCII: 0xA0 ASCII_CODE+0x80,
611 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
612 RULE is a one byte code of the range 0xA0..0xF0 that
613 represents a composition rule.
614 */
615
616 enum emacs_code_class_type emacs_code_class[256];
617
618 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
619 Check if a text is encoded in Emacs' internal format. If it is,
620 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
621
622 static int
623 detect_coding_emacs_mule (src, src_end, multibytep)
624 unsigned char *src, *src_end;
625 int multibytep;
626 {
627 unsigned char c;
628 int composing = 0;
629 /* Dummy for ONE_MORE_BYTE. */
630 struct coding_system dummy_coding;
631 struct coding_system *coding = &dummy_coding;
632
633 while (1)
634 {
635 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
636
637 if (composing)
638 {
639 if (c < 0xA0)
640 composing = 0;
641 else if (c == 0xA0)
642 {
643 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
644 c &= 0x7F;
645 }
646 else
647 c -= 0x20;
648 }
649
650 if (c < 0x20)
651 {
652 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
653 return 0;
654 }
655 else if (c >= 0x80 && c < 0xA0)
656 {
657 if (c == 0x80)
658 /* Old leading code for a composite character. */
659 composing = 1;
660 else
661 {
662 unsigned char *src_base = src - 1;
663 int bytes;
664
665 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
666 bytes))
667 return 0;
668 src = src_base + bytes;
669 }
670 }
671 }
672 label_end_of_loop:
673 return CODING_CATEGORY_MASK_EMACS_MULE;
674 }
675
676
/* Open a new composition record in CODING->cmp_data for a composition
   of METHOD that starts at character position START.

   Four ints are reserved at the current end of the data:
     [0]  -1 for now; CODING_ADD_COMPOSITION_END replaces it with the
          length of the record,
     [1]  START plus cmp_data->char_offset,
     [2]  left unset here; filled in by CODING_ADD_COMPOSITION_END,
     [3]  METHOD.
   CODING->cmp_data_start remembers where the record begins.

   The arguments are parenthesized so that any expression (e.g. a
   conditional expression) may be passed safely.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmp_data = (coding)->cmp_data;             \
    int *data = cmp_data->data + cmp_data->used;                        \
    (coding)->cmp_data_start = cmp_data->used;                          \
    data[0] = -1;                                                       \
    data[1] = cmp_data->char_offset + (start);                          \
    data[3] = (int) (method);                                           \
    cmp_data->used += 4;                                                \
  } while (0)
689
/* Close the composition record that starts at CODING->cmp_data_start:
   store the record's length (in ints) in its slot 0 and the ending
   character position END plus cmp_data->char_offset in its slot 2.

   The arguments are parenthesized so that any expression (e.g. a
   conditional expression) may be passed safely.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_data = (coding)->cmp_data;             \
    int *data = cmp_data->data + (coding)->cmp_data_start;              \
    data[0] = cmp_data->used - (coding)->cmp_data_start;                \
    data[2] = cmp_data->char_offset + (end);                            \
  } while (0)
699
/* Append one COMPONENT (an alternate character or a composition rule)
   to the composition data of CODING.  When the current record has
   thereby grown to COMPOSITION_DATA_MAX_BUNCH_LENGTH ints, close it
   at CODING->produced_char and leave composing mode by setting
   CODING->composing to COMPOSITION_NO.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    coding->cmp_data->data[coding->cmp_data->used] = component;         \
    coding->cmp_data->used++;                                           \
    if (COMPOSITION_DATA_MAX_BUNCH_LENGTH                               \
        == coding->cmp_data->used - coding->cmp_data_start)             \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
712
713
/* Yield the byte that `src' points to and increment `src'.  If `src'
   is not less than `src_end', yield -1 and leave `src' alone.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
718
719
/* Decode one character that is a component of an Emacs 20 style
   composition sequence at `src'.  Set C to that character, store the
   bytes of its multibyte form at P, and advance P past them.  If the
   source is exhausted or no valid character is found, set C to -1
   (bytes may already have been stored at P in that case).

   A component is either 0xA0 followed by an ASCII code plus 0x80, or
   a multibyte sequence whose leading code has 0x20 added to it.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                        \
  do {                                                                  \
    int bytes;                                                          \
                                                                        \
    c = SAFE_ONE_MORE_BYTE ();                                          \
    if (c >= 0)                                                         \
      {                                                                 \
        if (CHAR_HEAD_P (c))                                            \
          c = -1;                                                       \
        else if (c == 0xA0)                                             \
          {                                                             \
            c = SAFE_ONE_MORE_BYTE ();                                  \
            if (c >= 0xA0)                                              \
              {                                                         \
                c -= 0x80;                                              \
                *p++ = c;                                               \
              }                                                         \
            else                                                        \
              c = -1;                                                   \
          }                                                             \
        else if (! BASE_LEADING_CODE_P (c - 0x20))                      \
          c = -1;                                                       \
        else                                                            \
          {                                                             \
            unsigned char *p0 = p;                                      \
                                                                        \
            c -= 0x20;                                                  \
            *p++ = c;                                                   \
            bytes = BYTES_BY_CHAR_HEAD (c);                             \
            while (--bytes)                                             \
              {                                                         \
                c = SAFE_ONE_MORE_BYTE ();                              \
                if (c < 0)                                              \
                  break;                                                \
                *p++ = c;                                               \
              }                                                         \
            if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)          \
                || (coding->flags /* We are recovering a file.  */      \
                    && p0[0] == LEADING_CODE_8_BIT_CONTROL              \
                    && ! CHAR_HEAD_P (p0[1])))                          \
              c = STRING_CHAR (p0, bytes);                              \
            else                                                        \
              c = -1;                                                   \
          }                                                             \
      }                                                                 \
  } while (0)
770
771
/* Decode a composition rule that is a component of an Emacs 20 style
   composition sequence at `src'.  Set C to the rule.  If no valid
   rule is found (the source is exhausted or the byte is outside the
   81 codes starting at 0xA0), set C to -1.  The caller's variables
   `gref' and `nref' are overwritten when a rule is found.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE () - 0xA0;                   \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
788
789
790 /* Decode composition sequence encoded by `emacs-mule' at the source
791 pointed by SRC. SRC_END is the end of source. Store information
792 of the composition in CODING->cmp_data.
793
794 For backward compatibility, decode also a composition sequence of
795 Emacs 20 style. In that case, the composition sequence contains
796 characters that should be extracted into a buffer or string. Store
797 those characters at *DESTINATION in multibyte form.
798
799 If we encounter an invalid byte sequence, return 0.
800 If we encounter an insufficient source or destination, or
801 insufficient space in CODING->cmp_data, return 1.
802 Otherwise, return consumed bytes in the source.
803
804 */
805 static INLINE int
806 decode_composition_emacs_mule (coding, src, src_end,
807 destination, dst_end, dst_bytes)
808 struct coding_system *coding;
809 const unsigned char *src, *src_end;
810 unsigned char **destination, *dst_end;
811 int dst_bytes;
812 {
813 unsigned char *dst = *destination;
814 int method, data_len, nchars;
815 const unsigned char *src_base = src++;
816 /* Store components of composition. */
817 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
818 int ncomponent;
819 /* Store multibyte form of characters to be composed. This is for
820 Emacs 20 style composition sequence. */
821 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
822 unsigned char *bufp = buf;
823 int c, i, gref, nref;
824
825 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
826 >= COMPOSITION_DATA_SIZE)
827 {
828 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
829 return -1;
830 }
831
832 ONE_MORE_BYTE (c);
833 if (c - 0xF0 >= COMPOSITION_RELATIVE
834 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
835 {
836 int with_rule;
837
838 method = c - 0xF0;
839 with_rule = (method == COMPOSITION_WITH_RULE
840 || method == COMPOSITION_WITH_RULE_ALTCHARS);
841 ONE_MORE_BYTE (c);
842 data_len = c - 0xA0;
843 if (data_len < 4
844 || src_base + data_len > src_end)
845 return 0;
846 ONE_MORE_BYTE (c);
847 nchars = c - 0xA0;
848 if (c < 1)
849 return 0;
850 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
851 {
852 /* If it is longer than this, it can't be valid. */
853 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
854 return 0;
855
856 if (ncomponent % 2 && with_rule)
857 {
858 ONE_MORE_BYTE (gref);
859 gref -= 32;
860 ONE_MORE_BYTE (nref);
861 nref -= 32;
862 c = COMPOSITION_ENCODE_RULE (gref, nref);
863 }
864 else
865 {
866 int bytes;
867 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
868 || (coding->flags /* We are recovering a file. */
869 && src[0] == LEADING_CODE_8_BIT_CONTROL
870 && ! CHAR_HEAD_P (src[1])))
871 c = STRING_CHAR (src, bytes);
872 else
873 c = *src, bytes = 1;
874 src += bytes;
875 }
876 component[ncomponent] = c;
877 }
878 }
879 else if (c >= 0x80)
880 {
881 /* This may be an old Emacs 20 style format. See the comment at
882 the section 2 of this file. */
883 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
884 if (src == src_end
885 && !(coding->mode & CODING_MODE_LAST_BLOCK))
886 goto label_end_of_loop;
887
888 src_end = src;
889 src = src_base + 1;
890 if (c < 0xC0)
891 {
892 method = COMPOSITION_RELATIVE;
893 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
894 {
895 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
896 if (c < 0)
897 break;
898 component[ncomponent++] = c;
899 }
900 if (ncomponent < 2)
901 return 0;
902 nchars = ncomponent;
903 }
904 else if (c == 0xFF)
905 {
906 method = COMPOSITION_WITH_RULE;
907 src++;
908 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
909 if (c < 0)
910 return 0;
911 component[0] = c;
912 for (ncomponent = 1;
913 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
914 {
915 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
916 if (c < 0)
917 break;
918 component[ncomponent++] = c;
919 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
920 if (c < 0)
921 break;
922 component[ncomponent++] = c;
923 }
924 if (ncomponent < 3)
925 return 0;
926 nchars = (ncomponent + 1) / 2;
927 }
928 else
929 return 0;
930 }
931 else
932 return 0;
933
934 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
935 {
936 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
937 for (i = 0; i < ncomponent; i++)
938 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
939 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
940 if (buf < bufp)
941 {
942 unsigned char *p = buf;
943 EMIT_BYTES (p, bufp);
944 *destination += bufp - buf;
945 coding->produced_char += nchars;
946 }
947 return (src - src_base);
948 }
949 label_end_of_loop:
950 return -1;
951 }
952
953 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
954
/* Decode SRC_BYTES bytes of emacs-mule text at SOURCE and store the
   result at DESTINATION.

   End-of-line bytes are converted according to CODING->eol_type, a
   0x80 byte is handed to decode_composition_emacs_mule when
   CODING->cmp_data is set, and every other byte sequence is either
   copied as is or, when it is not a well-formed sequence, emitted one
   byte at a time through CHAR_STRING.

   On return CODING->consumed and CODING->consumed_char hold the
   number of source bytes fully processed (SRC_BASE - SOURCE),
   CODING->produced the number of bytes written, and
   CODING->produced_char the number of characters written.
   CODING->result is set here when the loop stops on an inconsistent
   end-of-line or on a full destination.

   When DST_BYTES is zero the destination limit is the current source
   pointer rather than DESTINATION + DST_BYTES (presumably the
   in-place conversion case -- confirm against the callers).  */
955 static void
956 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
957 struct coding_system *coding;
958 const unsigned char *source;
959 unsigned char *destination;
960 int src_bytes, dst_bytes;
961 {
962 const unsigned char *src = source;
963 const unsigned char *src_end = source + src_bytes;
964 unsigned char *dst = destination;
965 unsigned char *dst_end = destination + dst_bytes;
966 /* SRC_BASE remembers the start position in source in each loop.
967 The loop will be exited when there's not enough source code, or
968 when there's not enough destination area to produce a
969 character. */
970 const unsigned char *src_base;
971
972 coding->produced_char = 0;
973 while ((src_base = src) < src_end)
974 {
975 unsigned char tmp[MAX_MULTIBYTE_LENGTH];
976 const unsigned char *p;
977 int bytes;
978
/* Carriage return: translate it according to CODING->eol_type. */
979 if (*src == '\r')
980 {
981 int c = *src++;
982
983 if (coding->eol_type == CODING_EOL_CR)
984 c = '\n';
985 else if (coding->eol_type == CODING_EOL_CRLF)
986 {
/* Look at the following byte: only CR LF collapses into a
   newline; otherwise the byte is pushed back and the CR is kept.
   ONE_MORE_BYTE is defined elsewhere; presumably it leaves through
   label_end_of_loop when the source is exhausted. */
987 ONE_MORE_BYTE (c);
988 if (c != '\n')
989 {
990 src--;
991 c = '\r';
992 }
993 }
/* NOTE(review): this store is not preceded by the destination-space
   check that guards the multibyte path below -- confirm that callers
   always leave room for it. */
994 *dst++ = c;
995 coding->produced_char++;
996 continue;
997 }
998 else if (*src == '\n')
999 {
/* A bare LF in text declared as CR or CRLF is reported as an
   inconsistent end-of-line when the caller asked for that check. */
1000 if ((coding->eol_type == CODING_EOL_CR
1001 || coding->eol_type == CODING_EOL_CRLF)
1002 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1003 {
1004 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1005 goto label_end_of_loop;
1006 }
1007 *dst++ = *src++;
1008 coding->produced_char++;
1009 continue;
1010 }
1011 else if (*src == 0x80 && coding->cmp_data)
1012 {
1013 /* Start of composition data. */
1014 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1015 &dst, dst_end,
1016 dst_bytes);
1017 if (consumed < 0)
1018 goto label_end_of_loop;
1019 else if (consumed > 0)
1020 {
1021 src += consumed;
1022 continue;
1023 }
/* CONSUMED is 0: the data was not accepted as a composition, so the
   0x80 byte itself is converted with CHAR_STRING and emitted below. */
1024 bytes = CHAR_STRING (*src, tmp);
1025 p = tmp;
1026 src++;
1027 }
/* NOTE(review): src[1] is read below without a visible check that
   src + 1 < src_end -- confirm that reading one byte past the last
   source byte is safe here. */
1028 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1029 || (coding->flags /* We are recovering a file. */
1030 && src[0] == LEADING_CODE_8_BIT_CONTROL
1031 && ! CHAR_HEAD_P (src[1])))
1032 {
/* Copy BYTES source bytes unchanged. */
1033 p = src;
1034 src += bytes;
1035 }
1036 else
1037 {
1038 int i, c;
1039
/* The leading byte announces BYTES bytes; check that the following
   BYTES - 1 bytes are all non-head bytes. */
1040 bytes = BYTES_BY_CHAR_HEAD (*src);
1041 src++;
1042 for (i = 1; i < bytes; i++)
1043 {
1044 ONE_MORE_BYTE (c);
1045 if (CHAR_HEAD_P (c))
1046 break;
1047 }
1048 if (i < bytes)
1049 {
/* A character head arrived too early: emit only the first byte,
   converted with CHAR_STRING, and resume right after it. */
1050 bytes = CHAR_STRING (*src_base, tmp);
1051 p = tmp;
1052 src = src_base + 1;
1053 }
1054 else
1055 {
1056 p = src_base;
1057 }
1058 }
/* Stop before writing if the BYTES bytes would reach the destination
   limit; SRC_BASE still points at the character that was not
   emitted, so it is not counted as consumed. */
1059 if (dst + bytes >= (dst_bytes ? dst_end : src))
1060 {
1061 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1062 break;
1063 }
1064 while (bytes--) *dst++ = *p++;
1065 coding->produced_char++;
1066 }
1067 label_end_of_loop:
1068 coding->consumed = coding->consumed_char = src_base - source;
1069 coding->produced = dst - destination;
1070 }
1071
1072
1073 /* Encode composition data stored at DATA into a special byte sequence
1074 starting with 0x80. Update CODING->cmp_data_start and maybe
1075 CODING->cmp_data for the next call.

   As read by this macro, DATA[0] is the length of the entry,
   DATA[2] - DATA[1] is the number of composed characters, DATA[3] is
   the composition method, and the components start at DATA[4].

   The sequence is assembled in a local 1024-byte buffer as:
     0x80
     0xF0 + METHOD
     0xA0 + length of the whole sequence (P - BUF, which includes
	    these four header bytes)
     0xA0 + number of composed characters
   followed by the components.  For COMPOSITION_WITH_RULE and
   COMPOSITION_WITH_RULE_ALTCHARS each further component is preceded
   by the two rule bytes 0x20 + GREF and 0x20 + NREF.

   The macro uses DST, DST_END, DST_BYTES and SRC from the expanding
   scope.  If the destination cannot hold the sequence it sets
   CODING->result to CODING_FINISH_INSUFFICIENT_DST and jumps to
   label_end_of_loop without writing anything.  After a successful
   write, CODING->cmp_data_start advances by DATA[0]; when that
   exhausts the current block and a next block exists, CODING moves
   on to the next block.  */
1076
1077 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1078 do { \
1079 unsigned char buf[1024], *p0 = buf, *p; \
1080 int len = data[0]; \
1081 int i; \
1082 \
1083 buf[0] = 0x80; \
1084 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1085 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1086 p = buf + 4; \
1087 if (data[3] == COMPOSITION_WITH_RULE \
1088 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1089 { \
1090 p += CHAR_STRING (data[4], p); \
1091 for (i = 5; i < len; i += 2) \
1092 { \
1093 int gref, nref; \
1094 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1095 *p++ = 0x20 + gref; \
1096 *p++ = 0x20 + nref; \
1097 p += CHAR_STRING (data[i + 1], p); \
1098 } \
1099 } \
1100 else \
1101 { \
1102 for (i = 4; i < len; i++) \
1103 p += CHAR_STRING (data[i], p); \
1104 } \
1105 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
1106 \
1107 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1108 { \
1109 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1110 goto label_end_of_loop; \
1111 } \
1112 while (p0 < p) \
1113 *dst++ = *p0++; \
1114 coding->cmp_data_start += data[0]; \
1115 if (coding->cmp_data_start == coding->cmp_data->used \
1116 && coding->cmp_data->next) \
1117 { \
1118 coding->cmp_data = coding->cmp_data->next; \
1119 coding->cmp_data_start = 0; \
1120 } \
1121 } while (0)
1122
1123
/* Forward declaration: encode_coding_emacs_mule hands the whole job
   to encode_eol when there is no composition data to encode. */
1124 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1125 unsigned char *, int, int));
1126
/* Encode SRC_BYTES bytes of text at SOURCE into emacs-mule format at
   DESTINATION.

   If CODING has no composition data (CODING->cmp_data is null or its
   `used' count is 0), the work is delegated to encode_eol.
   Otherwise each character is read with ONE_MORE_CHAR and written
   back, with these differences from a plain copy:
     - whenever the current character position equals the start
       position of the next composition record, the record is emitted
       first via ENCODE_COMPOSITION_EMACS_MULE;
     - a newline becomes CR LF or CR when CODING->eol_type is
       CODING_EOL_CRLF or CODING_EOL_CR;
     - a single-byte non-ASCII character keeps its multibyte form
       when CODING->flags is nonzero.

   The `while (1)' loop has no exit of its own; it is left through
   label_end_of_loop, which ENCODE_COMPOSITION_EMACS_MULE jumps to
   when the destination is full (ONE_MORE_CHAR and the EMIT_* macros
   are defined elsewhere and presumably do the same -- confirm).

   On return CODING->consumed is the number of source bytes processed
   and CODING->produced and CODING->produced_char are both set to the
   number of bytes written.  */
1127 static void
1128 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1129 struct coding_system *coding;
1130 const unsigned char *source;
1131 unsigned char *destination;
1132 int src_bytes, dst_bytes;
1133 {
1134 const unsigned char *src = source;
1135 const unsigned char *src_end = source + src_bytes;
1136 unsigned char *dst = destination;
1137 unsigned char *dst_end = destination + dst_bytes;
1138 const unsigned char *src_base;
1139 int c;
1140 int char_offset;
1141 int *data;
1142
/* Not used directly in this function; presumably referenced by
   ONE_MORE_CHAR -- confirm before removing. */
1143 Lisp_Object translation_table;
1144
1145 translation_table = Qnil;
1146
1147 /* Optimization for the case that there's no composition. */
1148 if (!coding->cmp_data || coding->cmp_data->used == 0)
1149 {
1150 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1151 return;
1152 }
1153
/* DATA points at the next composition record still to be encoded. */
1154 char_offset = coding->cmp_data->char_offset;
1155 data = coding->cmp_data->data + coding->cmp_data_start;
1156 while (1)
1157 {
1158 src_base = src;
1159
1160 /* If SRC starts a composition, encode the information about the
1161 composition in advance. */
1162 if (coding->cmp_data_start < coding->cmp_data->used
1163 && char_offset + coding->consumed_char == data[1])
1164 {
1165 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
/* The macro may have advanced to the next record or to the next
   block, so reload both cached values. */
1166 char_offset = coding->cmp_data->char_offset;
1167 data = coding->cmp_data->data + coding->cmp_data_start;
1168 }
1169
1170 ONE_MORE_CHAR (c);
1171 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1172 || coding->eol_type == CODING_EOL_CR))
1173 {
1174 if (coding->eol_type == CODING_EOL_CRLF)
1175 EMIT_TWO_BYTES ('\r', c);
1176 else
1177 EMIT_ONE_BYTE ('\r');
1178 }
1179 else if (SINGLE_BYTE_CHAR_P (c))
1180 {
1181 if (coding->flags && ! ASCII_BYTE_P (c))
1182 {
1183 /* As we are auto saving, retain the multibyte form for
1184 8-bit chars. */
1185 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1186 int bytes = CHAR_STRING (c, buf);
1187
1188 if (bytes == 1)
1189 EMIT_ONE_BYTE (buf[0]);
1190 else
1191 EMIT_TWO_BYTES (buf[0], buf[1]);
1192 }
1193 else
1194 EMIT_ONE_BYTE (c);
1195 }
1196 else
/* Multibyte character: copy its source bytes unchanged. */
1197 EMIT_BYTES (src_base, src);
1198 coding->consumed_char++;
1199 }
1200 label_end_of_loop:
1201 coding->consumed = src_base - source;
1202 coding->produced = coding->produced_char = dst - destination;
1203 return;
1204 }
1205
1206 \f
1207 /*** 3. ISO2022 handlers ***/
1208
1209 /* The following note describes the coding system ISO2022 briefly.
1210 Since the intention of this note is to help understand the
1211 functions in this file, some parts are NOT ACCURATE or are OVERLY
1212 SIMPLIFIED. For thorough understanding, please refer to the
1213 original document of ISO2022. This is equivalent to the standard
1214 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1215
1216 ISO2022 provides many mechanisms to encode several character sets
1217 in 7-bit and 8-bit environments. For 7-bit environments, all text
1218 is encoded using bytes less than 128. This may make the encoded
1219 text a little bit longer, but the text passes more easily through
1220 several types of gateway, some of which strip off the MSB (Most
1221 Significant Bit).
1222
1223 There are two kinds of character sets: control character sets and
1224 graphic character sets. The former contain control characters such
1225 as `newline' and `escape' to provide control functions (control
1226 functions are also provided by escape sequences). The latter
1227 contain graphic characters such as 'A' and '-'. Emacs recognizes
1228 two control character sets and many graphic character sets.
1229
1230 Graphic character sets are classified into one of the following
1231 four classes, according to the number of bytes (DIMENSION) and
1232 number of characters in one dimension (CHARS) of the set:
1233 - DIMENSION1_CHARS94
1234 - DIMENSION1_CHARS96
1235 - DIMENSION2_CHARS94
1236 - DIMENSION2_CHARS96
1237
1238 In addition, each character set is assigned an identification tag,
1239 unique for each set, called the "final character" (denoted as <F>
1240 hereafter). The <F> of each character set is decided by ECMA(*)
1241 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1242 (0x30..0x3F are for private use only).
1243
1244 Note (*): ECMA = European Computer Manufacturers Association
1245
1246 Here are examples of graphic character sets [NAME(<F>)]:
1247 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1248 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1249 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1250 o DIMENSION2_CHARS96 -- none for the moment
1251
1252 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1253 C0 [0x00..0x1F] -- control character plane 0
1254 GL [0x20..0x7F] -- graphic character plane 0
1255 C1 [0x80..0x9F] -- control character plane 1
1256 GR [0xA0..0xFF] -- graphic character plane 1
1257
1258 A control character set is directly designated and invoked to C0 or
1259 C1 by an escape sequence. The most common case is that:
1260 - ISO646's control character set is designated/invoked to C0, and
1261 - ISO6429's control character set is designated/invoked to C1,
1262 and usually these designations/invocations are omitted in encoded
1263 text. In a 7-bit environment, only C0 can be used, and a control
1264 character for C1 is encoded by an appropriate escape sequence to
1265 fit into the environment. All control characters for C1 are
1266 defined to have corresponding escape sequences.
1267
1268 A graphic character set is at first designated to one of four
1269 graphic registers (G0 through G3), then these graphic registers are
1270 invoked to GL or GR. These designations and invocations can be
1271 done independently. The most common case is that G0 is invoked to
1272 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1273 these invocations and designations are omitted in encoded text.
1274 In a 7-bit environment, only GL can be used.
1275
1276 When a graphic character set of CHARS94 is invoked to GL, codes
1277 0x20 and 0x7F of the GL area work as control characters SPACE and
1278 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1279 be used.
1280
1281 There are two ways of invocation: locking-shift and single-shift.
1282 With locking-shift, the invocation lasts until the next different
1283 invocation, whereas with single-shift, the invocation affects the
1284 following character only and doesn't affect the locking-shift
1285 state. Invocations are done by the following control characters or
1286 escape sequences:
1287
1288 ----------------------------------------------------------------------
1289 abbrev function cntrl escape seq description
1290 ----------------------------------------------------------------------
1291 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1292 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1293 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1294 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1295 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1296 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1297 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1298 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1299 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1300 ----------------------------------------------------------------------
1301 (*) These are not used by any known coding system.
1302
1303 Control characters for these functions are defined by macros
1304 ISO_CODE_XXX in `coding.h'.
1305
1306 Designations are done by the following escape sequences:
1307 ----------------------------------------------------------------------
1308 escape sequence description
1309 ----------------------------------------------------------------------
1310 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1311 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1312 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1313 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1314 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1315 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1316 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1317 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1318 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1319 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1320 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1321 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1322 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1323 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1324 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1325 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1326 ----------------------------------------------------------------------
1327
1328 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1329 of dimension 1, chars 94, and final character <F>, etc...
1330
1331 Note (*): Although these designations are not allowed in ISO2022,
1332 Emacs accepts them on decoding, and produces them on encoding
1333 CHARS96 character sets in a coding system which is characterized as
1334 7-bit environment, non-locking-shift, and non-single-shift.
1335
1336 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1337 '(' can be omitted. We refer to this as "short-form" hereafter.
1338
1339 Now you may notice that there are a lot of ways of encoding the
1340 same multilingual text in ISO2022. Actually, there exist many
1341 coding systems such as Compound Text (used in X11's inter-client
1342 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1343 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1344 localized platforms), and all of these are variants of ISO2022.
1345
1346 In addition to the above, Emacs handles two more kinds of escape
1347 sequences: ISO6429's direction specification and Emacs' private
1348 sequence for specifying character composition.
1349
1350 ISO6429's direction specification takes the following form:
1351 o CSI ']' -- end of the current direction
1352 o CSI '0' ']' -- end of the current direction
1353 o CSI '1' ']' -- start of left-to-right text
1354 o CSI '2' ']' -- start of right-to-left text
1355 The control character CSI (0x9B: control sequence introducer) is
1356 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1357
1358 Character composition specification takes the following form:
1359 o ESC '0' -- start relative composition
1360 o ESC '1' -- end composition
1361 o ESC '2' -- start rule-base composition (*)
1362 o ESC '3' -- start relative composition with alternate chars (**)
1363 o ESC '4' -- start rule-base composition with alternate chars (**)
1364 Since these are not standard escape sequences of any ISO standard,
1365 the use of them with these meanings is restricted to Emacs only.
1366
1367 (*) This form is used only in Emacs 20.5 and older versions,
1368 but the newer versions can safely decode it.
1369 (**) This form is used only in Emacs 21.1 and newer versions,
1370 and the older versions can't decode it.
1371
1372 Here's a list of example usages of these composition escape
1373 sequences (categorized by `enum composition_method').
1374
1375 COMPOSITION_RELATIVE:
1376 ESC 0 CHAR [ CHAR ] ESC 1
1377 COMPOSITION_WITH_RULE:
1378 ESC 2 CHAR [ RULE CHAR ] ESC 1
1379 COMPOSITION_WITH_ALTCHARS:
1380 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1381 COMPOSITION_WITH_RULE_ALTCHARS:
1382 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1383
/* Table with one entry per possible byte value (0..255) giving the
   ISO2022 code class of that byte.  decode_coding_iso2022 switches on
   iso_code_class[C1] for the first byte it reads in each iteration of
   its main loop. */
1384 enum iso_code_class_type iso_code_class[256];
1385
/* Nonzero if the coding system stored at coding_system_table[IDX]
   exists and can handle CHARSET: either CHARSET is CHARSET_ASCII or
   the character C is safe for that coding system (per
   coding_safe_chars and CODING_SAFE_CHAR_P), and in addition the
   coding system's requested designation for CHARSET is not
   CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.

   Side effect: for a non-ASCII CHARSET the macro assigns to a
   variable named `safe_chars', which must be declared in the
   expanding scope. */
1386 #define CHARSET_OK(idx, charset, c) \
1387 (coding_system_table[idx] \
1388 && (charset == CHARSET_ASCII \
1389 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1390 CODING_SAFE_CHAR_P (safe_chars, c))) \
1391 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1392 charset) \
1393 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1394
/* Nonzero if the coding system at coding_system_table[IDX] has a
   non-negative initial designation for register 1 (presumably graphic
   register G1 -- confirm against CODING_SPEC_ISO_INITIAL_DESIGNATION).
   Unlike CHARSET_OK, this macro does not test the table entry
   itself before passing it on. */
1395 #define SHIFT_OUT_OK(idx) \
1396 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1397
/* Nonzero unless the `composing' member of the coding system at
   coding_system_table[IDX] equals COMPOSITION_DISABLED.  The table
   entry is dereferenced without a null check. */
1398 #define COMPOSITION_OK(idx) \
1399 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1400
1401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1402 Check if the text between SRC and SRC_END is encoded in ISO2022.
1403 If it is, return an integer in which the appropriate flag bits among:
1404 CODING_CATEGORY_MASK_ISO_7
1405 CODING_CATEGORY_MASK_ISO_7_TIGHT
1406 CODING_CATEGORY_MASK_ISO_8_1
1407 CODING_CATEGORY_MASK_ISO_8_2
1408 CODING_CATEGORY_MASK_ISO_7_ELSE
1409 CODING_CATEGORY_MASK_ISO_8_ELSE
1410 are set. If a code which should never appear in ISO2022 is found,
1411 return 0.

   The result is MASK & MASK_FOUND.  MASK starts as
   CODING_CATEGORY_MASK_ISO and loses the bits of the categories that
   the text rules out; MASK_FOUND starts empty and collects the bits
   of the categories for which positive evidence was seen.  Scanning
   stops as soon as MASK becomes 0 or the text is exhausted.

   ESC, SO, SI, CSI, SS2 and SS3 are not interpreted when
   inhibit_iso_escape_detection is set.  MULTIBYTEP is only passed on
   to ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
1412
1413 static int
1414 detect_coding_iso2022 (src, src_end, multibytep)
1415 unsigned char *src, *src_end;
1416 int multibytep;
1417 {
1418 int mask = CODING_CATEGORY_MASK_ISO;
1419 int mask_found = 0;
/* NOTE(review): SHIFT_OUT is initialized to 0 here and is not
   assigned anywhere else in the visible body, so the
   `shift_out == 1' test in the ISO_CODE_SI case cannot succeed as
   written -- confirm whether tracking of the shift state was
   intended. */
1420 int reg[4], shift_out = 0, single_shifting = 0;
1421 int c, c1, charset;
1422 /* Dummy for ONE_MORE_BYTE. */
1423 struct coding_system dummy_coding;
1424 struct coding_system *coding = &dummy_coding;
/* Assigned as a side effect of CHARSET_OK. */
1425 Lisp_Object safe_chars;
1426
/* REG[i] is the charset most recently designated to graphic register
   Gi, or -1 if none; G0 starts out as ASCII. */
1427 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1428 while (mask && src < src_end)
1429 {
1430 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1431 retry:
1432 switch (c)
1433 {
1434 case ISO_CODE_ESC:
1435 if (inhibit_iso_escape_detection)
1436 break;
1437 single_shifting = 0;
1438 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1439 if (c >= '(' && c <= '/')
1440 {
1441 /* Designation sequence for a charset of dimension 1. */
1442 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1443 if (c1 < ' ' || c1 >= 0x80
1444 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1445 /* Invalid designation sequence. Just ignore. */
1446 break;
/* '(' ')' '*' '+' (94-char sets) and ',' '-' '.' '/' (96-char sets)
   each select G0..G3 in that order, hence the modulo 4. */
1447 reg[(c - '(') % 4] = charset;
1448 }
1449 else if (c == '$')
1450 {
1451 /* Designation sequence for a charset of dimension 2. */
1452 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1453 if (c >= '@' && c <= 'B')
1454 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1455 reg[0] = charset = iso_charset_table[1][0][c];
1456 else if (c >= '(' && c <= '/')
1457 {
1458 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1459 if (c1 < ' ' || c1 >= 0x80
1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1461 /* Invalid designation sequence. Just ignore. */
1462 break;
1463 reg[(c - '(') % 4] = charset;
1464 }
1465 else
1466 /* Invalid designation sequence. Just ignore. */
1467 break;
1468 }
1469 else if (c == 'N' || c == 'O')
1470 {
1471 /* ESC <Fe> for SS2 or SS3. */
1472 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1473 break;
1474 }
1475 else if (c >= '0' && c <= '4')
1476 {
1477 /* ESC <Fp> for start/end composition. */
/* For each category: count it as found if its coding system handles
   compositions, otherwise rule it out. */
1478 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1486 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1490 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1494 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1495 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1496 else
1497 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1498 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1499 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1500 else
1501 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1502 break;
1503 }
1504 else
1505 /* Invalid escape sequence. Just ignore. */
1506 break;
1507
1508 /* We found a valid designation sequence for CHARSET. */
1509 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
/* C is a representative character of CHARSET, used by CHARSET_OK for
   the safe-character test. */
1510 c = MAKE_CHAR (charset, 0, 0);
1511 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1512 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1513 else
1514 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1515 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1516 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1517 else
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1519 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1520 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1521 else
1522 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1523 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1524 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1525 else
1526 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1527 break;
1528
1529 case ISO_CODE_SO:
1530 if (inhibit_iso_escape_detection)
1531 break;
1532 single_shifting = 0;
1533 if (shift_out == 0
1534 && (reg[1] >= 0
1535 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1536 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1537 {
1538 /* Locking shift out. */
1539 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1540 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1541 }
1542 break;
1543
1544 case ISO_CODE_SI:
1545 if (inhibit_iso_escape_detection)
1546 break;
1547 single_shifting = 0;
1548 if (shift_out == 1)
1549 {
1550 /* Locking shift in. */
1551 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1552 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1553 }
1554 break;
1555
1556 case ISO_CODE_CSI:
1557 single_shifting = 0;
/* Fall through. */
1558 case ISO_CODE_SS2:
1559 case ISO_CODE_SS3:
1560 {
/* These 8-bit control codes keep only ISO_8_ELSE, plus ISO_8_1 and
   ISO_8_2 when the corresponding coding system allows single shifts
   or lists this code in Vlatin_extra_code_table. */
1561 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1562
1563 if (inhibit_iso_escape_detection)
1564 break;
1565 if (c != ISO_CODE_CSI)
1566 {
1567 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1568 & CODING_FLAG_ISO_SINGLE_SHIFT)
1569 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1570 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1571 & CODING_FLAG_ISO_SINGLE_SHIFT)
1572 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1573 single_shifting = 1;
1574 }
1575 if (VECTORP (Vlatin_extra_code_table)
1576 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1577 {
1578 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1579 & CODING_FLAG_ISO_LATIN_EXTRA)
1580 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1581 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1582 & CODING_FLAG_ISO_LATIN_EXTRA)
1583 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1584 }
1585 mask &= newmask;
1586 mask_found |= newmask;
1587 }
1588 break;
1589
1590 default:
1591 if (c < 0x80)
1592 {
1593 single_shifting = 0;
1594 break;
1595 }
1596 else if (c < 0xA0)
1597 {
/* Any other code in 0x80..0x9F is acceptable only when it is listed
   in Vlatin_extra_code_table; otherwise the text is not ISO2022. */
1598 single_shifting = 0;
1599 if (VECTORP (Vlatin_extra_code_table)
1600 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1601 {
1602 int newmask = 0;
1603
1604 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1605 & CODING_FLAG_ISO_LATIN_EXTRA)
1606 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1607 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1608 & CODING_FLAG_ISO_LATIN_EXTRA)
1609 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1610 mask &= newmask;
1611 mask_found |= newmask;
1612 }
1613 else
1614 return 0;
1615 }
1616 else
1617 {
1618 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1619 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1620 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1621 /* Check the length of succeeding codes of the range
1622 0xA0..0xFF. If the byte length is odd, we exclude
1623 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1624 when we are not single shifting. */
1625 if (!single_shifting
1626 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1627 {
/* I counts the bytes of the run, starting with the byte already in
   hand; C is -1 until the loop below reads a byte. */
1628 int i = 1;
1629
1630 c = -1;
1631 while (src < src_end)
1632 {
1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1634 if (c < 0xA0)
1635 break;
1636 i++;
1637 }
1638
/* An odd run length rules out ISO_8_2 only when more text remains
   after the byte that stopped the loop. */
1639 if (i & 1 && src < src_end)
1640 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1641 else
1642 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1643 if (c >= 0)
1644 /* This means that we have read one extra byte. */
1645 goto retry;
1646 }
1647 }
1648 break;
1649 }
1650 }
1651 label_end_of_loop:
1652 return (mask & mask_found);
1653 }
1654
1655 /* Decode a character of which charset is CHARSET, the 1st position
1656 code is C1, the 2nd position code is C2, and return the decoded
1657 character code. If the variable `translation_table' is non-nil,
1658 return the translated code instead.

   `translation_table' is not a macro argument; it is taken from the
   scope in which the macro is expanded.  */
1659
1660 #define DECODE_ISO_CHARACTER(charset, c1, c2) \
1661 (NILP (translation_table) \
1662 ? MAKE_CHAR (charset, c1, c2) \
1663 : translate_char (translation_table, -1, charset, c1, c2))
1664
1665 /* Set designation state into CODING.

   Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   and designate it to graphic register REG of CODING.  The macro
   uses `coding' and `safe_chars' from the expanding scope and jumps
   to the label `label_invalid_code' there in these cases:

   - FINAL_CHAR is below '0' or not below 128;
   - the charset is unknown (negative), or it is neither the charset
     requested for REG nor safe according to SAFE_CHARS; REG is then
     remembered in last_invalid_designation_register;
   - ASCII is designated to register 0 while the last invalid
     designation was also for register 0; the remembered register is
     reset to -1 first (see the comment in the body for the reason).

   Otherwise last_invalid_designation_register is reset to -1 and the
   charset is stored as the designation of REG; when
   CODING_MODE_DIRECTION is set in CODING->mode and the charset has a
   reverse charset, the reverse charset is stored instead.  */
1666 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1667 do { \
1668 int charset, c; \
1669 \
1670 if (final_char < '0' || final_char >= 128) \
1671 goto label_invalid_code; \
1672 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1673 make_number (chars), \
1674 make_number (final_char)); \
1675 c = MAKE_CHAR (charset, 0, 0); \
1676 if (charset >= 0 \
1677 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1678 || CODING_SAFE_CHAR_P (safe_chars, c))) \
1679 { \
1680 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1681 && reg == 0 \
1682 && charset == CHARSET_ASCII) \
1683 { \
1684 /* We should insert this designation sequence as is so \
1685 that it is surely written back to a file. */ \
1686 coding->spec.iso2022.last_invalid_designation_register = -1; \
1687 goto label_invalid_code; \
1688 } \
1689 coding->spec.iso2022.last_invalid_designation_register = -1; \
1690 if ((coding->mode & CODING_MODE_DIRECTION) \
1691 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1692 charset = CHARSET_REVERSE_CHARSET (charset); \
1693 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1694 } \
1695 else \
1696 { \
1697 coding->spec.iso2022.last_invalid_designation_register = reg; \
1698 goto label_invalid_code; \
1699 } \
1700 } while (0)
1701
1702 /* Allocate a memory block for storing information about compositions.
1703 The block is chained to the already allocated blocks. */
1704
1705 void
1706 coding_allocate_composition_data (coding, char_offset)
1707 struct coding_system *coding;
1708 int char_offset;
1709 {
1710 struct composition_data *cmp_data
1711 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1712
1713 cmp_data->char_offset = char_offset;
1714 cmp_data->used = 0;
1715 cmp_data->prev = coding->cmp_data;
1716 cmp_data->next = NULL;
1717 if (coding->cmp_data)
1718 coding->cmp_data->next = cmp_data;
1719 coding->cmp_data = cmp_data;
1720 coding->cmp_data_start = 0;
1721 coding->composing = COMPOSITION_NO;
1722 }
1723
1724 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1725 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1726 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1727 ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1728 ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  The macro uses `coding' and
   `dst' from the expanding scope and may jump to the label
   `label_end_of_loop' there.  Three cases are distinguished:

   - composition is disabled for CODING: the escape sequence is
     written to DST literally (ESC, then C1 & 0x7f) and counted as
     two produced characters;
   - no composition is in progress: a new composition is started with
     the method selected by C1, unless CODING->cmp_data lacks room,
     in which case CODING->result becomes
     CODING_FINISH_INSUFFICIENT_CMP and the loop is left;
   - a composition is already in progress: for the two ALTCHARS
     methods the state switches to COMPOSITION_RELATIVE; for the
     other methods nothing is done.
1729 */
1730
1731 #define DECODE_COMPOSITION_START(c1) \
1732 do { \
1733 if (coding->composing == COMPOSITION_DISABLED) \
1734 { \
1735 *dst++ = ISO_CODE_ESC; \
1736 *dst++ = c1 & 0x7f; \
1737 coding->produced_char += 2; \
1738 } \
1739 else if (!COMPOSING_P (coding)) \
1740 { \
1741 /* This is surely the start of a composition. We must be sure \
1742 that coding->cmp_data has enough space to store the \
1743 information about the composition. If not, terminate the \
1744 current decoding loop, allocate one more memory block for \
1745 coding->cmp_data in the caller, then start the decoding \
1746 loop again. We can't allocate memory here directly because \
1747 it may cause buffer/string relocation. */ \
1748 if (!coding->cmp_data \
1749 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1750 >= COMPOSITION_DATA_SIZE)) \
1751 { \
1752 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1753 goto label_end_of_loop; \
1754 } \
1755 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1756 : c1 == '2' ? COMPOSITION_WITH_RULE \
1757 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1758 : COMPOSITION_WITH_RULE_ALTCHARS); \
1759 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1760 coding->composing); \
1761 coding->composition_rule_follows = 0; \
1762 } \
1763 else \
1764 { \
1765 /* We are already handling a composition. If the method is \
1766 the following two, the codes following the current escape \
1767 sequence are actual characters stored in a buffer. */ \
1768 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1769 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1770 { \
1771 coding->composing = COMPOSITION_RELATIVE; \
1772 coding->composition_rule_follows = 0; \
1773 } \
1774 } \
1775 } while (0)
1776
1777 /* Handle composition end sequence ESC 1.

   C1 is the byte that followed ESC.  If no composition is in
   progress, the escape sequence is written to DST literally (ESC,
   then C1) and counted as two produced characters.  Otherwise the
   end of the composition is recorded at CODING->produced_char and
   CODING->composing is reset to COMPOSITION_NO.  The macro uses
   `coding' and `dst' from the expanding scope.  */
1778
1779 #define DECODE_COMPOSITION_END(c1) \
1780 do { \
1781 if (! COMPOSING_P (coding)) \
1782 { \
1783 *dst++ = ISO_CODE_ESC; \
1784 *dst++ = c1; \
1785 coding->produced_char += 2; \
1786 } \
1787 else \
1788 { \
1789 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
1790 coding->composing = COMPOSITION_NO; \
1791 } \
1792 } while (0)
1793
1794 /* Decode a composition rule from the byte C1 (and maybe one more byte
1795 from SRC) and store one encoded composition rule in
1796 coding->cmp_data.

   C1 must be a modifiable variable: 32 is subtracted from it in
   place.  After that adjustment:
   - a value below 81 is the old one-byte format, where
     GREF = C1 / 9 and NREF = C1 % 9, and a value of 4 for either is
     replaced by 10;
   - a value from 81 to 92 is the new two-byte format: one more byte
     is read into the variable `c2' of the expanding scope and the
     rule is built from C1 - 81 and C2 - 32;
   - for any larger value the rule stays 0 and is still recorded.
   Finally CODING->composition_rule_follows is cleared.  */
1797
1798 #define DECODE_COMPOSITION_RULE(c1) \
1799 do { \
1800 int rule = 0; \
1801 (c1) -= 32; \
1802 if (c1 < 81) /* old format (before ver.21) */ \
1803 { \
1804 int gref = (c1) / 9; \
1805 int nref = (c1) % 9; \
1806 if (gref == 4) gref = 10; \
1807 if (nref == 4) nref = 10; \
1808 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
1809 } \
1810 else if (c1 < 93) /* new format (after ver.21) */ \
1811 { \
1812 ONE_MORE_BYTE (c2); \
1813 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1814 } \
1815 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
1816 coding->composition_rule_follows = 0; \
1817 } while (0)
1818
1819
1820 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1821
1822 static void
1823 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1824 struct coding_system *coding;
1825 const unsigned char *source;
1826 unsigned char *destination;
1827 int src_bytes, dst_bytes;
1828 {
1829 const unsigned char *src = source;
1830 const unsigned char *src_end = source + src_bytes;
1831 unsigned char *dst = destination;
1832 unsigned char *dst_end = destination + dst_bytes;
1833 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1834 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1835 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1836 /* SRC_BASE remembers the start position in source in each loop.
1837 The loop will be exited when there's not enough source code
1838 (within macro ONE_MORE_BYTE), or when there's not enough
1839 destination area to produce a character (within macro
1840 EMIT_CHAR). */
1841 const unsigned char *src_base;
1842 int c, charset;
1843 Lisp_Object translation_table;
1844 Lisp_Object safe_chars;
1845
1846 safe_chars = coding_safe_chars (coding->symbol);
1847
1848 if (NILP (Venable_character_translation))
1849 translation_table = Qnil;
1850 else
1851 {
1852 translation_table = coding->translation_table_for_decode;
1853 if (NILP (translation_table))
1854 translation_table = Vstandard_translation_table_for_decode;
1855 }
1856
1857 coding->result = CODING_FINISH_NORMAL;
1858
1859 while (1)
1860 {
1861 int c1, c2 = 0;
1862
1863 src_base = src;
1864 ONE_MORE_BYTE (c1);
1865
1866 /* We produce no character or one character. */
1867 switch (iso_code_class [c1])
1868 {
1869 case ISO_0x20_or_0x7F:
1870 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1871 {
1872 DECODE_COMPOSITION_RULE (c1);
1873 continue;
1874 }
1875 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1876 {
1877 /* This is SPACE or DEL. */
1878 charset = CHARSET_ASCII;
1879 break;
1880 }
1881 /* This is a graphic character, we fall down ... */
1882
1883 case ISO_graphic_plane_0:
1884 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1885 {
1886 DECODE_COMPOSITION_RULE (c1);
1887 continue;
1888 }
1889 charset = charset0;
1890 break;
1891
1892 case ISO_0xA0_or_0xFF:
1893 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1894 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1895 goto label_invalid_code;
1896 /* This is a graphic character, we fall down ... */
1897
1898 case ISO_graphic_plane_1:
1899 if (charset1 < 0)
1900 goto label_invalid_code;
1901 charset = charset1;
1902 break;
1903
1904 case ISO_control_0:
1905 if (COMPOSING_P (coding))
1906 DECODE_COMPOSITION_END ('1');
1907
1908 /* All ISO2022 control characters in this class have the
1909 same representation in Emacs internal format. */
1910 if (c1 == '\n'
1911 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1912 && (coding->eol_type == CODING_EOL_CR
1913 || coding->eol_type == CODING_EOL_CRLF))
1914 {
1915 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1916 goto label_end_of_loop;
1917 }
1918 charset = CHARSET_ASCII;
1919 break;
1920
1921 case ISO_control_1:
1922 if (COMPOSING_P (coding))
1923 DECODE_COMPOSITION_END ('1');
1924 goto label_invalid_code;
1925
1926 case ISO_carriage_return:
1927 if (COMPOSING_P (coding))
1928 DECODE_COMPOSITION_END ('1');
1929
1930 if (coding->eol_type == CODING_EOL_CR)
1931 c1 = '\n';
1932 else if (coding->eol_type == CODING_EOL_CRLF)
1933 {
1934 ONE_MORE_BYTE (c1);
1935 if (c1 != ISO_CODE_LF)
1936 {
1937 src--;
1938 c1 = '\r';
1939 }
1940 }
1941 charset = CHARSET_ASCII;
1942 break;
1943
1944 case ISO_shift_out:
1945 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1946 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1947 goto label_invalid_code;
1948 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1949 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1950 continue;
1951
1952 case ISO_shift_in:
1953 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1954 goto label_invalid_code;
1955 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1956 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1957 continue;
1958
1959 case ISO_single_shift_2_7:
1960 case ISO_single_shift_2:
1961 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1962 goto label_invalid_code;
1963 /* SS2 is handled as an escape sequence of ESC 'N' */
1964 c1 = 'N';
1965 goto label_escape_sequence;
1966
1967 case ISO_single_shift_3:
1968 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1969 goto label_invalid_code;
1970 /* SS3 is handled as an escape sequence of ESC 'O' */
1971 c1 = 'O';
1972 goto label_escape_sequence;
1973
1974 case ISO_control_sequence_introducer:
1975 /* CSI is handled as an escape sequence of ESC '[' ... */
1976 c1 = '[';
1977 goto label_escape_sequence;
1978
1979 case ISO_escape:
1980 ONE_MORE_BYTE (c1);
1981 label_escape_sequence:
1982 /* Escape sequences handled by Emacs are invocation,
1983 designation, direction specification, and character
1984 composition specification. */
1985 switch (c1)
1986 {
1987 case '&': /* revision of following character set */
1988 ONE_MORE_BYTE (c1);
1989 if (!(c1 >= '@' && c1 <= '~'))
1990 goto label_invalid_code;
1991 ONE_MORE_BYTE (c1);
1992 if (c1 != ISO_CODE_ESC)
1993 goto label_invalid_code;
1994 ONE_MORE_BYTE (c1);
1995 goto label_escape_sequence;
1996
1997 case '$': /* designation of 2-byte character set */
1998 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1999 goto label_invalid_code;
2000 ONE_MORE_BYTE (c1);
2001 if (c1 >= '@' && c1 <= 'B')
2002 { /* designation of JISX0208.1978, GB2312.1980,
2003 or JISX0208.1980 */
2004 DECODE_DESIGNATION (0, 2, 94, c1);
2005 }
2006 else if (c1 >= 0x28 && c1 <= 0x2B)
2007 { /* designation of DIMENSION2_CHARS94 character set */
2008 ONE_MORE_BYTE (c2);
2009 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2010 }
2011 else if (c1 >= 0x2C && c1 <= 0x2F)
2012 { /* designation of DIMENSION2_CHARS96 character set */
2013 ONE_MORE_BYTE (c2);
2014 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2015 }
2016 else
2017 goto label_invalid_code;
2018 /* We must update these variables now. */
2019 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2020 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2021 continue;
2022
2023 case 'n': /* invocation of locking-shift-2 */
2024 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2025 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2026 goto label_invalid_code;
2027 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2028 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2029 continue;
2030
2031 case 'o': /* invocation of locking-shift-3 */
2032 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2033 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2034 goto label_invalid_code;
2035 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2036 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2037 continue;
2038
2039 case 'N': /* invocation of single-shift-2 */
2040 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2041 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2042 goto label_invalid_code;
2043 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2044 ONE_MORE_BYTE (c1);
2045 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2046 goto label_invalid_code;
2047 break;
2048
2049 case 'O': /* invocation of single-shift-3 */
2050 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2051 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2052 goto label_invalid_code;
2053 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2054 ONE_MORE_BYTE (c1);
2055 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2056 goto label_invalid_code;
2057 break;
2058
2059 case '0': case '2': case '3': case '4': /* start composition */
2060 DECODE_COMPOSITION_START (c1);
2061 continue;
2062
2063 case '1': /* end composition */
2064 DECODE_COMPOSITION_END (c1);
2065 continue;
2066
2067 case '[': /* specification of direction */
2068 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2069 goto label_invalid_code;
2070 /* For the moment, nested direction is not supported.
2071 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2072 left-to-right, and nonzero means right-to-left. */
2073 ONE_MORE_BYTE (c1);
2074 switch (c1)
2075 {
2076 case ']': /* end of the current direction */
2077 coding->mode &= ~CODING_MODE_DIRECTION;
2078
2079 case '0': /* end of the current direction */
2080 case '1': /* start of left-to-right direction */
2081 ONE_MORE_BYTE (c1);
2082 if (c1 == ']')
2083 coding->mode &= ~CODING_MODE_DIRECTION;
2084 else
2085 goto label_invalid_code;
2086 break;
2087
2088 case '2': /* start of right-to-left direction */
2089 ONE_MORE_BYTE (c1);
2090 if (c1 == ']')
2091 coding->mode |= CODING_MODE_DIRECTION;
2092 else
2093 goto label_invalid_code;
2094 break;
2095
2096 default:
2097 goto label_invalid_code;
2098 }
2099 continue;
2100
2101 case '%':
2102 if (COMPOSING_P (coding))
2103 DECODE_COMPOSITION_END ('1');
2104 ONE_MORE_BYTE (c1);
2105 if (c1 == '/')
2106 {
2107 /* CTEXT extended segment:
2108 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2109 We keep these bytes as is for the moment.
2110 They may be decoded by post-read-conversion. */
2111 int dim, M, L;
2112 int size, required;
2113 int produced_chars;
2114
2115 ONE_MORE_BYTE (dim);
2116 ONE_MORE_BYTE (M);
2117 ONE_MORE_BYTE (L);
2118 size = ((M - 128) * 128) + (L - 128);
2119 required = 8 + size * 2;
2120 if (dst + required > (dst_bytes ? dst_end : src))
2121 goto label_end_of_loop;
2122 *dst++ = ISO_CODE_ESC;
2123 *dst++ = '%';
2124 *dst++ = '/';
2125 *dst++ = dim;
2126 produced_chars = 4;
2127 dst += CHAR_STRING (M, dst), produced_chars++;
2128 dst += CHAR_STRING (L, dst), produced_chars++;
2129 while (size-- > 0)
2130 {
2131 ONE_MORE_BYTE (c1);
2132 dst += CHAR_STRING (c1, dst), produced_chars++;
2133 }
2134 coding->produced_char += produced_chars;
2135 }
2136 else if (c1 == 'G')
2137 {
2138 unsigned char *d = dst;
2139 int produced_chars;
2140
2141 /* XFree86 extension for embedding UTF-8 in CTEXT:
2142 ESC % G --UTF-8-BYTES-- ESC % @
2143 We keep these bytes as is for the moment.
2144 They may be decoded by post-read-conversion. */
2145 if (d + 6 > (dst_bytes ? dst_end : src))
2146 goto label_end_of_loop;
2147 *d++ = ISO_CODE_ESC;
2148 *d++ = '%';
2149 *d++ = 'G';
2150 produced_chars = 3;
2151 while (d + 1 < (dst_bytes ? dst_end : src))
2152 {
2153 ONE_MORE_BYTE (c1);
2154 if (c1 == ISO_CODE_ESC
2155 && src + 1 < src_end
2156 && src[0] == '%'
2157 && src[1] == '@')
2158 {
2159 src += 2;
2160 break;
2161 }
2162 d += CHAR_STRING (c1, d), produced_chars++;
2163 }
2164 if (d + 3 > (dst_bytes ? dst_end : src))
2165 goto label_end_of_loop;
2166 *d++ = ISO_CODE_ESC;
2167 *d++ = '%';
2168 *d++ = '@';
2169 dst = d;
2170 coding->produced_char += produced_chars + 3;
2171 }
2172 else
2173 goto label_invalid_code;
2174 continue;
2175
2176 default:
2177 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2178 goto label_invalid_code;
2179 if (c1 >= 0x28 && c1 <= 0x2B)
2180 { /* designation of DIMENSION1_CHARS94 character set */
2181 ONE_MORE_BYTE (c2);
2182 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2183 }
2184 else if (c1 >= 0x2C && c1 <= 0x2F)
2185 { /* designation of DIMENSION1_CHARS96 character set */
2186 ONE_MORE_BYTE (c2);
2187 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2188 }
2189 else
2190 goto label_invalid_code;
2191 /* We must update these variables now. */
2192 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2193 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2194 continue;
2195 }
2196 }
2197
2198 /* Now we know CHARSET and 1st position code C1 of a character.
2199 Produce a multibyte sequence for that character while getting
2200 2nd position code C2 if necessary. */
2201 if (CHARSET_DIMENSION (charset) == 2)
2202 {
2203 ONE_MORE_BYTE (c2);
2204 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2205 /* C2 is not in a valid range. */
2206 goto label_invalid_code;
2207 }
2208 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2209 EMIT_CHAR (c);
2210 continue;
2211
2212 label_invalid_code:
2213 coding->errors++;
2214 if (COMPOSING_P (coding))
2215 DECODE_COMPOSITION_END ('1');
2216 src = src_base;
2217 c = *src++;
2218 if (! NILP (translation_table))
2219 c = translate_char (translation_table, c, 0, 0, 0);
2220 EMIT_CHAR (c);
2221 }
2222
2223 label_end_of_loop:
2224 coding->consumed = coding->consumed_char = src_base - source;
2225 coding->produced = dst - destination;
2226 return;
2227 }
2228
2229
2230 /* ISO2022 encoding stuff. */
2231
2232 /*
2233 It is not enough to say just "ISO2022" on encoding, we have to
2234 specify more details. In Emacs, each ISO2022 coding system
2235 variant has the following specifications:
2236 1. Initial designation to G0 through G3.
2237 2. Allows short-form designation?
2238 3. ASCII should be designated to G0 before control characters?
2239 4. ASCII should be designated to G0 at end of line?
2240 5. 7-bit environment or 8-bit environment?
2241 6. Use locking-shift?
2242 7. Use Single-shift?
2243 And the following two are only for Japanese:
2244 8. Use ASCII in place of JIS0201-1976-Roman?
2245 9. Use JISX0208-1983 in place of JISX0208-1978?
2246 These specifications are encoded in `coding->flags' as flag bits
2247 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2248 details.
2249 */
2250
/* Write at DST the escape sequence designating CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  If CODING holds a revision number below 255 for CHARSET,
   an `ESC & <'@' + revision>' announcement is written first.  The
   intermediate character is left out (short form) only for a
   two-dimensional 94-char set going to register 0 whose <final-char>
   is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                          \
  do {                                                                    \
    unsigned char final_ = CHARSET_ISO_FINAL_CHAR (charset);              \
    /* Intermediate characters for registers 0..3, indexed by REG.  */    \
    const char *inter_                                                    \
      = (CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./");                \
    int rev_ = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);         \
    int multi_ = CHARSET_DIMENSION (charset) != 1;                        \
    int short_ = (multi_                                                  \
                  && CHARSET_CHARS (charset) == 94                        \
                  && ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)       \
                  && (reg) == 0                                           \
                  && final_ >= '@' && final_ <= 'B');                     \
                                                                          \
    if (rev_ < 255)                                                       \
      {                                                                   \
        *dst++ = ISO_CODE_ESC;                                            \
        *dst++ = '&';                                                     \
        *dst++ = '@' + rev_;                                              \
      }                                                                   \
    *dst++ = ISO_CODE_ESC;                                                \
    if (multi_)                                                           \
      *dst++ = '$';                                                       \
    if (! short_)                                                         \
      *dst++ = (unsigned char) inter_[reg];                               \
    *dst++ = final_;                                                      \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                  \
  } while (0)
2293
/* Emit the ISO2022 single-shift-2 (ENCODE_SINGLE_SHIFT_2) or
   single-shift-3 (ENCODE_SINGLE_SHIFT_3) function at DST: the two-byte
   form `ESC N' / `ESC O' when CODING has CODING_FLAG_ISO_SEVEN_BITS
   set, otherwise the single control byte SS2 / SS3.  In both cases
   CODING is marked as being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2315
/* Emit an ISO2022 locking-shift function at DST and record in CODING
   which graphic register is now invoked to graphic plane 0:
   shift-in selects register 0, shift-out register 1, and
   locking-shift-2 / locking-shift-3 (`ESC n' / `ESC o') registers 2
   and 3.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    *dst = ISO_CODE_SI;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    *dst = ISO_CODE_SO;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
2343
/* Emit the one-byte code of a DIMENSION1 character of CHARSET whose
   position-code is C1.  The byte is written with the high bit cleared
   when CHARSET is on graphic plane 0 and with it set when CHARSET is
   on graphic plane 1; right after a single shift the choice follows
   CODING_FLAG_ISO_SEVEN_BITS instead and the single-shifting state is
   cleared.  If CHARSET is on neither plane, the needed designation
   and invocation codes are produced first and the checks are
   repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                      \
  do {                                                                    \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                         \
      {                                                                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                   \
          *dst++ = (c1) & 0x7F;                                           \
        else                                                              \
          *dst++ = (c1) | 0x80;                                           \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                     \
        break;                                                            \
      }                                                                   \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                   \
        *dst++ = (c1) & 0x7F;                                             \
        break;                                                            \
      }                                                                   \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                   \
        *dst++ = (c1) | 0x80;                                             \
        break;                                                            \
      }                                                                   \
    /* CHARSET is on no graphic plane yet: designate and/or invoke it,    \
       then go around again to emit the character itself.  */             \
    dst = encode_invocation_designation (charset, coding, dst);           \
  } while (1)
2376
/* Emit the two-byte code of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2.  Both bytes are written with the high
   bit cleared when CHARSET is on graphic plane 0 and with it set when
   CHARSET is on graphic plane 1; right after a single shift the
   choice follows CODING_FLAG_ISO_SEVEN_BITS instead and the
   single-shifting state is cleared.  If CHARSET is on neither plane,
   the needed designation and invocation codes are produced first and
   the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                  \
  do {                                                                    \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                         \
      {                                                                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                   \
          {                                                               \
            *dst++ = (c1) & 0x7F;                                         \
            *dst++ = (c2) & 0x7F;                                         \
          }                                                               \
        else                                                              \
          {                                                               \
            *dst++ = (c1) | 0x80;                                         \
            *dst++ = (c2) | 0x80;                                         \
          }                                                               \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                     \
        break;                                                            \
      }                                                                   \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                   \
        *dst++ = (c1) & 0x7F;                                             \
        *dst++ = (c2) & 0x7F;                                             \
        break;                                                            \
      }                                                                   \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                   \
        *dst++ = (c1) | 0x80;                                             \
        *dst++ = (c2) | 0x80;                                             \
        break;                                                            \
      }                                                                   \
    /* CHARSET is on no graphic plane yet: designate and/or invoke it,    \
       then go around again to emit the character itself.  */             \
    dst = encode_invocation_designation (charset, coding, dst);           \
  } while (1)
2409
/* Encode character C at DST.  C is split into its charset and
   position-codes.  A defined charset goes through the DIMENSION1 or
   DIMENSION2 encoder, after substituting charset_latin_jisx0201 for
   ASCII under CODING_FLAG_ISO_USE_ROMAN, or charset_jisx0208_1978 for
   charset_jisx0208 under CODING_FLAG_ISO_USE_OLDJIS.  For an
   undefined charset the position-codes are copied out as they are
   (the second one only if it is not negative).  */
#define ENCODE_ISO_CHARACTER(c)                                           \
  do {                                                                    \
    int cs_, pos1_, pos2_;                                                \
                                                                          \
    SPLIT_CHAR (c, cs_, pos1_, pos2_);                                    \
    if (! CHARSET_DEFINED_P (cs_))                                        \
      {                                                                   \
        *dst++ = pos1_;                                                   \
        if (pos2_ >= 0)                                                   \
          *dst++ = pos2_;                                                 \
      }                                                                   \
    else if (CHARSET_DIMENSION (cs_) == 1)                                \
      {                                                                   \
        if (cs_ == CHARSET_ASCII                                          \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))               \
          cs_ = charset_latin_jisx0201;                                   \
        ENCODE_ISO_CHARACTER_DIMENSION1 (cs_, pos1_);                     \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        if (cs_ == charset_jisx0208                                       \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))              \
          cs_ = charset_jisx0208_1978;                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 (cs_, pos1_, pos2_);              \
      }                                                                   \
  } while (0)
2439
2440
/* Stand-in for a character C that must not be encoded: emit
   CODING_REPLACEMENT_CHARACTER once, or twice when the charset of C
   has a width greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                        \
  do {                                                                    \
    int left_ = 1;                                                        \
    while (left_-- > 0)                                                   \
      {                                                                   \
        ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
        /* After the first replacement, schedule a second one for a       \
           wide character.  LEFT_ is -1 on the second pass, so this       \
           fires at most once.  */                                        \
        if (left_ == 0 && CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)           \
          left_ = 1, left_--, left_ = 1, left_ = -1 + 2, left_ = 1;       \
        else                                                              \
          left_ = -1;                                                     \
        if (left_ == 1)                                                   \
          {                                                               \
            ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);          \
            left_ = -1;                                                   \
          }                                                               \
      }                                                                   \
  } while (0)
2449
2450
2451 /* Produce designation and invocation codes at a place pointed by DST
2452 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2453 Return new DST. */
2454
2455 unsigned char *
2456 encode_invocation_designation (charset, coding, dst)
2457 int charset;
2458 struct coding_system *coding;
2459 unsigned char *dst;
2460 {
2461 int reg; /* graphic register number */
2462
2463 /* At first, check designations. */
2464 for (reg = 0; reg < 4; reg++)
2465 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2466 break;
2467
2468 if (reg >= 4)
2469 {
2470 /* CHARSET is not yet designated to any graphic registers. */
2471 /* At first check the requested designation. */
2472 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2473 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2474 /* Since CHARSET requests no special designation, designate it
2475 to graphic register 0. */
2476 reg = 0;
2477
2478 ENCODE_DESIGNATION (charset, reg, coding);
2479 }
2480
2481 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2482 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2483 {
2484 /* Since the graphic register REG is not invoked to any graphic
2485 planes, invoke it to graphic plane 0. */
2486 switch (reg)
2487 {
2488 case 0: /* graphic register 0 */
2489 ENCODE_SHIFT_IN;
2490 break;
2491
2492 case 1: /* graphic register 1 */
2493 ENCODE_SHIFT_OUT;
2494 break;
2495
2496 case 2: /* graphic register 2 */
2497 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2498 ENCODE_SINGLE_SHIFT_2;
2499 else
2500 ENCODE_LOCKING_SHIFT_2;
2501 break;
2502
2503 case 3: /* graphic register 3 */
2504 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2505 ENCODE_SINGLE_SHIFT_3;
2506 else
2507 ENCODE_LOCKING_SHIFT_3;
2508 break;
2509 }
2510 }
2511
2512 return dst;
2513 }
2514
/* Emit the two bytes that encode composition rule RULE: RULE is
   decoded into a global and a new reference point, written as
   `32 + 81 + GREF' followed by `32 + NREF'.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref_, nref_;                                   \
    COMPOSITION_DECODE_RULE (rule, gref_, nref_);       \
    dst[0] = 32 + 81 + gref_;                           \
    dst[1] = 32 + nref_;                                \
    dst += 2;                                           \
  } while (0)
2524
/* Emit the escape sequence that opens a composition and put CODING
   into the matching composing state.  DATA points to the integers
   describing the composition (see the comment in coding.h for their
   format); DATA[3] is stored as the composing method.  `ESC 0' is
   produced for COMPOSITION_RELATIVE, `ESC 3' for
   COMPOSITION_WITH_ALTCHARS, and `ESC 4' for any other method.  In
   the latter two cases CODING is also set to read components starting
   4 integers past the start of the composition data, with no
   composition rule pending.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    (coding)->composing = (data)[3];                                    \
    *dst++ = ISO_CODE_ESC;                                              \
    if ((coding)->composing != COMPOSITION_RELATIVE)                    \
      {                                                                 \
        if ((coding)->composing == COMPOSITION_WITH_ALTCHARS)           \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        (coding)->cmp_data_index = (coding)->cmp_data_start + 4;        \
        (coding)->composition_rule_follows = 0;                         \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2544
/* Emit `ESC 1' to close the current composition, step CODING past
   the DATA[0] integers describing it, and leave composing state.  If
   that uses up the current composition data block and another block
   follows, switch to the start of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = '1';                                                       \
    (coding)->cmp_data_start += (data)[0];                              \
    (coding)->composing = COMPOSITION_NO;                               \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used)           \
      {                                                                 \
        if ((coding)->cmp_data->next)                                   \
          {                                                             \
            (coding)->cmp_data = (coding)->cmp_data->next;              \
            (coding)->cmp_data_start = 0;                               \
          }                                                             \
      }                                                                 \
  } while (0)
2560
/* Emit `ESC 0' and mark CODING as composing relatively.  This does
   not open a new composition: it is used once the components
   (alternate chars and composition rules) of a composition have been
   produced, to signal that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    (coding)->composing = COMPOSITION_RELATIVE;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = '0';                                       \
  } while (0)
2572
2573 /* The following three macros produce codes for indicating direction
2574 of text. */
/* Emit a control sequence introducer at DST: the two-byte form
   `ESC [' when CODING has CODING_FLAG_ISO_SEVEN_BITS set, otherwise
   the single byte ISO_CODE_CSI.  The flag is tested with `&', like
   every other flag test in this file, so that other flag bits being
   set does not hide it.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
2582
/* Emit the direction-specifying control sequence at DST: a control
   sequence introducer followed by `2 ]' for right-to-left
   (ENCODE_DIRECTION_R2L) or `0 ]' for left-to-right
   (ENCODE_DIRECTION_L2R).  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an
   object-like statement macro, so it is invoked without arguments and
   each of these is itself a single statement.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2588
/* Bring the graphic planes and registers of CODING back to their
   initial state, emitting the codes needed at DST: a shift-in if
   graphic plane 0 does not have register 0 invoked, then a
   designation for every register that has an initial designation
   differing from its current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                   \
  do {                                                                    \
    int reg;                                                              \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                      \
      ENCODE_SHIFT_IN;                                                    \
    for (reg = 0; reg < 4; reg++)                                         \
      {                                                                   \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)        \
          continue;                                                       \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))         \
          continue;                                                       \
        ENCODE_DESIGNATION                                                \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                   \
  } while (0)
2603
2604 /* Produce designation sequences of charsets in the line started from
2605 SRC to a place pointed by DST, and return updated DST.
2606
2607 If the current block ends before any end-of-line, we may fail to
2608 find all the necessary designations. */
2609
2610 static unsigned char *
2611 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2612 struct coding_system *coding;
2613 Lisp_Object translation_table;
2614 const unsigned char *src, *src_end;
2615 unsigned char *dst;
2616 {
2617 int charset, c, found = 0, reg;
2618 /* Table of charsets to be designated to each graphic register. */
2619 int r[4];
2620
2621 for (reg = 0; reg < 4; reg++)
2622 r[reg] = -1;
2623
2624 while (found < 4)
2625 {
2626 ONE_MORE_CHAR (c);
2627 if (c == '\n')
2628 break;
2629
2630 charset = CHAR_CHARSET (c);
2631 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2632 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2633 {
2634 found++;
2635 r[reg] = charset;
2636 }
2637 }
2638
2639 label_end_of_loop:
2640 if (found)
2641 {
2642 for (reg = 0; reg < 4; reg++)
2643 if (r[reg] >= 0
2644 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2645 ENCODE_DESIGNATION (r[reg], reg, coding);
2646 }
2647
2648 return dst;
2649 }
2650
2651 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2652
/* Encode the text at SOURCE (SRC_BYTES bytes) into ISO2022 at
   DESTINATION, recording progress in CODING.

   Each iteration of the main loop first checks for room in the
   output, then emits any pending beginning-of-line designations and
   composition start/end sequences, and finally reads one character
   and encodes it.  CR, LF and other control characters are handled
   inline (honoring the reset-at-control / reset-at-EOL flags and the
   EOL type); other ASCII goes through ENCODE_ISO_CHARACTER; remaining
   single-byte characters are copied out raw and counted in
   CODING->errors; everything else is encoded, or replaced via
   ENCODE_UNSAFE_CHARACTER when unencodable characters are inhibited
   and the character is not in the safe set.

   On exit CODING->consumed is the offset of SRC_BASE, i.e. of the
   start of the last iteration that did not complete, and
   CODING->produced and CODING->produced_char are both the number of
   bytes written.  CODING->result is set to
   CODING_FINISH_INSUFFICIENT_DST here when the output runs short.
   NOTE(review): when DST_BYTES is zero the output limit is derived
   from SRC instead of DST_END, which looks like support for
   converting in place within one buffer -- confirm against callers.  */
2653 static void
2654 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2655 struct coding_system *coding;
2656 const unsigned char *source;
2657 unsigned char *destination;
2658 int src_bytes, dst_bytes;
2659 {
2660 const unsigned char *src = source;
2661 const unsigned char *src_end = source + src_bytes;
2662 unsigned char *dst = destination;
2663 unsigned char *dst_end = destination + dst_bytes;
2664 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2665 from DST_END to assure overflow checking is necessary only at the
2666 head of loop. */
2667 unsigned char *adjusted_dst_end = dst_end - 19;
2668 /* SRC_BASE remembers the start position in source in each loop.
2669 The loop will be exited when there's not enough source text to
2670 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2671 there's not enough destination area to produce encoded codes
2672 (within macro EMIT_BYTES). */
2673 const unsigned char *src_base;
2674 int c;
2675 Lisp_Object translation_table;
2676 Lisp_Object safe_chars;
2677
/* A coding system flagged CODING_FLAG_ISO_SAFE always runs with
   unencodable characters inhibited.  */
2678 if (coding->flags & CODING_FLAG_ISO_SAFE)
2679 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2680
2681 safe_chars = coding_safe_chars (coding->symbol);
2682
/* Pick the translation table: none when character translation is
   disabled, else the coding system's own table, else the standard
   one for encoding.  */
2683 if (NILP (Venable_character_translation))
2684 translation_table = Qnil;
2685 else
2686 {
2687 translation_table = coding->translation_table_for_encode;
2688 if (NILP (translation_table))
2689 translation_table = Vstandard_translation_table_for_encode;
2690 }
2691
2692 coding->consumed_char = 0;
2693 coding->errors = 0;
2694 while (1)
2695 {
2696 src_base = src;
2697
2698 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2699 {
2700 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2701 break;
2702 }
2703
2704 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2705 && CODING_SPEC_ISO_BOL (coding))
2706 {
2707 /* We have to produce designation sequences if any now. */
2708 dst = encode_designation_at_bol (coding, translation_table,
2709 src, src_end, dst);
2710 CODING_SPEC_ISO_BOL (coding) = 0;
2711 }
2712
2713 /* Check composition start and end. */
2714 if (coding->composing != COMPOSITION_DISABLED
2715 && coding->cmp_data_start < coding->cmp_data->used)
2716 {
2717 struct composition_data *cmp_data = coding->cmp_data;
2718 int *data = cmp_data->data + coding->cmp_data_start;
2719 int this_pos = cmp_data->char_offset + coding->consumed_char;
2720
2721 if (coding->composing == COMPOSITION_RELATIVE)
2722 {
2723 if (this_pos == data[2])
2724 {
2725 ENCODE_COMPOSITION_END (coding, data);
/* ENCODE_COMPOSITION_END may have advanced to the next
   composition or data block, so reload both pointers.  */
2726 cmp_data = coding->cmp_data;
2727 data = cmp_data->data + coding->cmp_data_start;
2728 }
2729 }
2730 else if (COMPOSING_P (coding))
2731 {
2732 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2733 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2734 /* We have consumed components of the composition.
2735 What follows in SRC is the composition's base
2736 text. */
2737 ENCODE_COMPOSITION_FAKE_START (coding);
2738 else
2739 {
/* Next component of the composition, taken from the
   composition data rather than from SRC.  This C shadows
   the function-level C.  */
2740 int c = cmp_data->data[coding->cmp_data_index++];
2741 if (coding->composition_rule_follows)
2742 {
2743 ENCODE_COMPOSITION_RULE (c);
2744 coding->composition_rule_follows = 0;
2745 }
2746 else
2747 {
2748 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2749 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2750 ENCODE_UNSAFE_CHARACTER (c);
2751 else
2752 ENCODE_ISO_CHARACTER (c);
2753 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2754 coding->composition_rule_follows = 1;
2755 }
/* A component was emitted without consuming any source
   character; start the next iteration.  */
2756 continue;
2757 }
2758 }
2759 if (!COMPOSING_P (coding))
2760 {
2761 if (this_pos == data[1])
2762 {
2763 ENCODE_COMPOSITION_START (coding, data);
2764 continue;
2765 }
2766 }
2767 }
2768
2769 ONE_MORE_CHAR (c);
2770
2771 /* Now encode the character C. */
2772 if (c < 0x20 || c == 0x7F)
2773 {
2774 if (c == '\r')
2775 {
2776 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2777 {
2778 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2779 ENCODE_RESET_PLANE_AND_REGISTER;
2780 *dst++ = c;
/* NOTE(review): this path skips the consumed_char
   increment at the bottom of the loop -- confirm that is
   intended.  */
2781 continue;
2782 }
2783 /* fall down to treat '\r' as '\n' ... */
2784 c = '\n';
2785 }
2786 if (c == '\n')
2787 {
2788 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2789 ENCODE_RESET_PLANE_AND_REGISTER;
/* With CODING_FLAG_ISO_INIT_AT_BOL the recorded current
   designations are reset to the initial ones without emitting
   any designation sequence here.  */
2790 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2791 bcopy (coding->spec.iso2022.initial_designation,
2792 coding->spec.iso2022.current_designation,
2793 sizeof coding->spec.iso2022.initial_designation);
/* Write the end-of-line in the form selected by eol_type: LF
   (also when undecided), CR LF, or CR for any other type.  */
2794 if (coding->eol_type == CODING_EOL_LF
2795 || coding->eol_type == CODING_EOL_UNDECIDED)
2796 *dst++ = ISO_CODE_LF;
2797 else if (coding->eol_type == CODING_EOL_CRLF)
2798 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2799 else
2800 *dst++ = ISO_CODE_CR;
2801 CODING_SPEC_ISO_BOL (coding) = 1;
2802 }
2803 else
2804 {
2805 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2806 ENCODE_RESET_PLANE_AND_REGISTER;
2807 *dst++ = c;
2808 }
2809 }
2810 else if (ASCII_BYTE_P (c))
2811 ENCODE_ISO_CHARACTER (c);
2812 else if (SINGLE_BYTE_CHAR_P (c))
2813 {
/* A non-ASCII single-byte character: pass the byte through
   unencoded and count it as an error.  */
2814 *dst++ = c;
2815 coding->errors++;
2816 }
2817 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2818 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2819 ENCODE_UNSAFE_CHARACTER (c);
2820 else
2821 ENCODE_ISO_CHARACTER (c);
2822
2823 coding->consumed_char++;
2824 }
2825
/* Reached by the `break' above and, according to the SRC_BASE
   comment, by a jump from within ONE_MORE_CHAR when the source
   runs out.  */
2826 label_end_of_loop:
2827 coding->consumed = src_base - source;
2828 coding->produced = coding->produced_char = dst - destination;
2829 }
2830
2831 \f
2832 /*** 4. SJIS and BIG5 handlers ***/
2833
2834 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2835 quite widely. So, for the moment, Emacs supports them in the bare
2836 C code. But, in the future, they may be supported only by CCL. */
2837
2838 /* SJIS is a coding system encoding three character sets: ASCII, right
2839 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2840 as is. A character of charset katakana-jisx0201 is encoded by
2841 "position-code + 0x80". A character of charset japanese-jisx0208
2842 is encoded in 2-byte but two position-codes are divided and shifted
2843 so that it fits in the range below.
2844
2845 --- CODE RANGE of SJIS ---
2846 (character set) (range)
2847 ASCII 0x00 .. 0x7F
2848 KATAKANA-JISX0201 0xA1 .. 0xDF
2849 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2850 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2851 -------------------------------
2852
2853 */
2854
2855 /* BIG5 is a coding system encoding two character sets: ASCII and
2856 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2857 character set and is encoded in two bytes.
2858
2859 --- CODE RANGE of BIG5 ---
2860 (character set) (range)
2861 ASCII 0x00 .. 0x7F
2862 Big5 (1st byte) 0xA1 .. 0xFE
2863 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2864 --------------------------
2865
2866 Since the number of characters in Big5 is larger than maximum
2867 characters in Emacs' charset (96x96), it can't be handled as one
2868 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2869 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2870 contains frequently used characters and the latter contains less
2871 frequently used characters. */
2872
/* Conversion between the BIG5 coding system and Emacs' two Big5
   charsets.  B1 and B2 are the 1st and 2nd bytes of a character in
   BIG5; C1 and C2 are its 1st and 2nd position-codes in Emacs'
   internal format; CHARSET is `charset_big5_1' or `charset_big5_2'.
   In both macros every input is read before any output is stored.  */

/* Number of Big5 characters sharing one 1st byte: the two 2nd-byte
   ranges 0xA1..0xFE and 0x40..0x7E taken together.  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))

/* Decode BIG5 bytes B1 and B2 into CHARSET, C1 and C2.  The byte pair
   is turned into a linear index; 1st bytes below 0xC9 belong to
   `charset_big5_1', the rest to `charset_big5_2', whose index is
   counted from 0xC9.  The index is then split into two position-codes
   starting at 0x21.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                              \
  do {                                                                    \
    unsigned int idx_                                                     \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                    \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                           \
    if ((b1) >= 0xC9)                                                     \
      {                                                                   \
        charset = charset_big5_2;                                         \
        idx_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                            \
      }                                                                   \
    else                                                                  \
      charset = charset_big5_1;                                           \
    c1 = idx_ / (0xFF - 0xA1) + 0x21;                                     \
    c2 = idx_ % (0xFF - 0xA1) + 0x21;                                     \
  } while (0)

/* Encode CHARSET, C1 and C2 into BIG5 bytes B1 and B2; the inverse of
   DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                              \
  do {                                                                    \
    unsigned int idx_ = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);    \
    unsigned int col_;                                                    \
    if ((charset) == charset_big5_2)                                      \
      idx_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                              \
    col_ = idx_ % BIG5_SAME_ROW;                                          \
    b1 = idx_ / BIG5_SAME_ROW + 0xA1;                                     \
    b2 = col_ + (col_ < 0x3F ? 0x40 : 0x62);                              \
  } while (0)
2905
2906 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2907 Check if a text is encoded in SJIS. If it is, return
2908 CODING_CATEGORY_MASK_SJIS, else return 0. */
2909
2910 static int
2911 detect_coding_sjis (src, src_end, multibytep)
2912 unsigned char *src, *src_end;
2913 int multibytep;
2914 {
2915 int c;
2916 /* Dummy for ONE_MORE_BYTE. */
2917 struct coding_system dummy_coding;
2918 struct coding_system *coding = &dummy_coding;
2919
2920 while (1)
2921 {
2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2923 if (c < 0x80)
2924 continue;
2925 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2926 return 0;
2927 if (c <= 0x9F || c >= 0xE0)
2928 {
2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2930 if (c < 0x40 || c == 0x7F || c > 0xFC)
2931 return 0;
2932 }
2933 }
2934 label_end_of_loop:
2935 return CODING_CATEGORY_MASK_SJIS;
2936 }
2937
2938 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2939 Check if a text is encoded in BIG5. If it is, return
2940 CODING_CATEGORY_MASK_BIG5, else return 0. */
2941
2942 static int
2943 detect_coding_big5 (src, src_end, multibytep)
2944 unsigned char *src, *src_end;
2945 int multibytep;
2946 {
2947 int c;
2948 /* Dummy for ONE_MORE_BYTE. */
2949 struct coding_system dummy_coding;
2950 struct coding_system *coding = &dummy_coding;
2951
2952 while (1)
2953 {
2954 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2955 if (c < 0x80)
2956 continue;
2957 if (c < 0xA1 || c > 0xFE)
2958 return 0;
2959 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2960 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2961 return 0;
2962 }
2963 label_end_of_loop:
2964 return CODING_CATEGORY_MASK_BIG5;
2965 }
2966
2967 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2968 Check if a text is encoded in UTF-8. If it is, return
2969 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2970
2971 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2972 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2973 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2974 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2975 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2976 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2977 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2978
2979 static int
2980 detect_coding_utf_8 (src, src_end, multibytep)
2981 unsigned char *src, *src_end;
2982 int multibytep;
2983 {
2984 unsigned char c;
2985 int seq_maybe_bytes;
2986 /* Dummy for ONE_MORE_BYTE. */
2987 struct coding_system dummy_coding;
2988 struct coding_system *coding = &dummy_coding;
2989
2990 while (1)
2991 {
2992 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2993 if (UTF_8_1_OCTET_P (c))
2994 continue;
2995 else if (UTF_8_2_OCTET_LEADING_P (c))
2996 seq_maybe_bytes = 1;
2997 else if (UTF_8_3_OCTET_LEADING_P (c))
2998 seq_maybe_bytes = 2;
2999 else if (UTF_8_4_OCTET_LEADING_P (c))
3000 seq_maybe_bytes = 3;
3001 else if (UTF_8_5_OCTET_LEADING_P (c))
3002 seq_maybe_bytes = 4;
3003 else if (UTF_8_6_OCTET_LEADING_P (c))
3004 seq_maybe_bytes = 5;
3005 else
3006 return 0;
3007
3008 do
3009 {
3010 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3011 if (!UTF_8_EXTRA_OCTET_P (c))
3012 return 0;
3013 seq_maybe_bytes--;
3014 }
3015 while (seq_maybe_bytes > 0);
3016 }
3017
3018 label_end_of_loop:
3019 return CODING_CATEGORY_MASK_UTF_8;
3020 }
3021
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text starts with a UTF-16 byte order mark.  If the
   first two bytes are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
3027
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits select the surrogate half, so they are all masked in;
   masking with 0xD800 alone would also match 0xDC00..0xDFFF and
   values such as 0xFFFD.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3037
3038 static int
3039 detect_coding_utf_16 (src, src_end, multibytep)
3040 unsigned char *src, *src_end;
3041 int multibytep;
3042 {
3043 unsigned char c1, c2;
3044 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3045 struct coding_system dummy_coding;
3046 struct coding_system *coding = &dummy_coding;
3047
3048 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3049 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3050
3051 if ((c1 == 0xFF) && (c2 == 0xFE))
3052 return CODING_CATEGORY_MASK_UTF_16_LE;
3053 else if ((c1 == 0xFE) && (c2 == 0xFF))
3054 return CODING_CATEGORY_MASK_UTF_16_BE;
3055
3056 label_end_of_loop:
3057 return 0;
3058 }
3059
3060 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3061 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3062
3063 static void
3064 decode_coding_sjis_big5 (coding, source, destination,
3065 src_bytes, dst_bytes, sjis_p)
3066 struct coding_system *coding;
3067 const unsigned char *source;
3068 unsigned char *destination;
3069 int src_bytes, dst_bytes;
3070 int sjis_p;
3071 {
3072 const unsigned char *src = source;
3073 const unsigned char *src_end = source + src_bytes;
3074 unsigned char *dst = destination;
3075 unsigned char *dst_end = destination + dst_bytes;
3076 /* SRC_BASE remembers the start position in source in each loop.
3077 The loop will be exited when there's not enough source code
3078 (within macro ONE_MORE_BYTE), or when there's not enough
3079 destination area to produce a character (within macro
3080 EMIT_CHAR). */
3081 const unsigned char *src_base;
3082 Lisp_Object translation_table;
3083
3084 if (NILP (Venable_character_translation))
3085 translation_table = Qnil;
3086 else
3087 {
3088 translation_table = coding->translation_table_for_decode;
3089 if (NILP (translation_table))
3090 translation_table = Vstandard_translation_table_for_decode;
3091 }
3092
3093 coding->produced_char = 0;
3094 while (1)
3095 {
3096 int c, charset, c1, c2 = 0;
3097
3098 src_base = src;
3099 ONE_MORE_BYTE (c1);
3100
3101 if (c1 < 0x80)
3102 {
3103 charset = CHARSET_ASCII;
3104 if (c1 < 0x20)
3105 {
3106 if (c1 == '\r')
3107 {
3108 if (coding->eol_type == CODING_EOL_CRLF)
3109 {
3110 ONE_MORE_BYTE (c2);
3111 if (c2 == '\n')
3112 c1 = c2;
3113 else
3114 /* To process C2 again, SRC is subtracted by 1. */
3115 src--;
3116 }
3117 else if (coding->eol_type == CODING_EOL_CR)
3118 c1 = '\n';
3119 }
3120 else if (c1 == '\n'
3121 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3122 && (coding->eol_type == CODING_EOL_CR
3123 || coding->eol_type == CODING_EOL_CRLF))
3124 {
3125 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3126 goto label_end_of_loop;
3127 }
3128 }
3129 }
3130 else
3131 {
3132 if (sjis_p)
3133 {
3134 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3135 goto label_invalid_code;
3136 if (c1 <= 0x9F || c1 >= 0xE0)
3137 {
3138 /* SJIS -> JISX0208 */
3139 ONE_MORE_BYTE (c2);
3140 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3141 goto label_invalid_code;
3142 DECODE_SJIS (c1, c2, c1, c2);
3143 charset = charset_jisx0208;
3144 }
3145 else
3146 /* SJIS -> JISX0201-Kana */
3147 charset = charset_katakana_jisx0201;
3148 }
3149 else
3150 {
3151 /* BIG5 -> Big5 */
3152 if (c1 < 0xA0 || c1 > 0xFE)
3153 goto label_invalid_code;
3154 ONE_MORE_BYTE (c2);
3155 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3156 goto label_invalid_code;
3157 DECODE_BIG5 (c1, c2, charset, c1, c2);
3158 }
3159 }
3160
3161 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3162 EMIT_CHAR (c);
3163 continue;
3164
3165 label_invalid_code:
3166 coding->errors++;
3167 src = src_base;
3168 c = *src++;
3169 EMIT_CHAR (c);
3170 }
3171
3172 label_end_of_loop:
3173 coding->consumed = coding->consumed_char = src_base - source;
3174 coding->produced = dst - destination;
3175 return;
3176 }
3177
3178 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3179 This function can encode charsets `ascii', `katakana-jisx0201',
3180 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3181 are sure that all these charsets are registered as official charset
3182 (i.e. do not have extended leading-codes). Characters of other
3183 charsets are produced without any encoding. If SJIS_P is 1, encode
3184 SJIS text, else encode BIG5 text. */
3185
3186 static void
3187 encode_coding_sjis_big5 (coding, source, destination,
3188 src_bytes, dst_bytes, sjis_p)
3189 struct coding_system *coding;
3190 unsigned char *source, *destination;
3191 int src_bytes, dst_bytes;
3192 int sjis_p;
3193 {
3194 unsigned char *src = source;
3195 unsigned char *src_end = source + src_bytes;
3196 unsigned char *dst = destination;
3197 unsigned char *dst_end = destination + dst_bytes;
3198 /* SRC_BASE remembers the start position in source in each loop.
3199 The loop will be exited when there's not enough source text to
3200 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3201 there's not enough destination area to produce encoded codes
3202 (within macro EMIT_BYTES). */
3203 unsigned char *src_base;
3204 Lisp_Object translation_table;
3205
3206 if (NILP (Venable_character_translation))
3207 translation_table = Qnil;
3208 else
3209 {
3210 translation_table = coding->translation_table_for_encode;
3211 if (NILP (translation_table))
3212 translation_table = Vstandard_translation_table_for_encode;
3213 }
3214
3215 while (1)
3216 {
3217 int c, charset, c1, c2;
3218
3219 src_base = src;
3220 ONE_MORE_CHAR (c);
3221
3222 /* Now encode the character C. */
3223 if (SINGLE_BYTE_CHAR_P (c))
3224 {
3225 switch (c)
3226 {
3227 case '\r':
3228 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3229 {
3230 EMIT_ONE_BYTE (c);
3231 break;
3232 }
3233 c = '\n';
3234 case '\n':
3235 if (coding->eol_type == CODING_EOL_CRLF)
3236 {
3237 EMIT_TWO_BYTES ('\r', c);
3238 break;
3239 }
3240 else if (coding->eol_type == CODING_EOL_CR)
3241 c = '\r';
3242 default:
3243 EMIT_ONE_BYTE (c);
3244 }
3245 }
3246 else
3247 {
3248 SPLIT_CHAR (c, charset, c1, c2);
3249 if (sjis_p)
3250 {
3251 if (charset == charset_jisx0208
3252 || charset == charset_jisx0208_1978)
3253 {
3254 ENCODE_SJIS (c1, c2, c1, c2);
3255 EMIT_TWO_BYTES (c1, c2);
3256 }
3257 else if (charset == charset_katakana_jisx0201)
3258 EMIT_ONE_BYTE (c1 | 0x80);
3259 else if (charset == charset_latin_jisx0201)
3260 EMIT_ONE_BYTE (c1);
3261 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3262 {
3263 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3264 if (CHARSET_WIDTH (charset) > 1)
3265 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3266 }
3267 else
3268 /* There's no way other than producing the internal
3269 codes as is. */
3270 EMIT_BYTES (src_base, src);
3271 }
3272 else
3273 {
3274 if (charset == charset_big5_1 || charset == charset_big5_2)
3275 {
3276 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3277 EMIT_TWO_BYTES (c1, c2);
3278 }
3279 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3280 {
3281 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3282 if (CHARSET_WIDTH (charset) > 1)
3283 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3284 }
3285 else
3286 /* There's no way other than producing the internal
3287 codes as is. */
3288 EMIT_BYTES (src_base, src);
3289 }
3290 }
3291 coding->consumed_char++;
3292 }
3293
3294 label_end_of_loop:
3295 coding->consumed = src_base - source;
3296 coding->produced = coding->produced_char = dst - destination;
3297 }
3298
3299 \f
3300 /*** 5. CCL handlers ***/
3301
3302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3303 Check if a text is encoded in a coding system of which
3304 encoder/decoder are written in CCL program. If it is, return
3305 CODING_CATEGORY_MASK_CCL, else return 0. */
3306
3307 static int
3308 detect_coding_ccl (src, src_end, multibytep)
3309 unsigned char *src, *src_end;
3310 int multibytep;
3311 {
3312 unsigned char *valid;
3313 int c;
3314 /* Dummy for ONE_MORE_BYTE. */
3315 struct coding_system dummy_coding;
3316 struct coding_system *coding = &dummy_coding;
3317
3318 /* No coding system is assigned to coding-category-ccl. */
3319 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3320 return 0;
3321
3322 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3323 while (1)
3324 {
3325 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3326 if (! valid[c])
3327 return 0;
3328 }
3329 label_end_of_loop:
3330 return CODING_CATEGORY_MASK_CCL;
3331 }
3332
3333 \f
3334 /*** 6. End-of-line handlers ***/
3335
3336 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3337
3338 static void
3339 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3340 struct coding_system *coding;
3341 const unsigned char *source;
3342 unsigned char *destination;
3343 int src_bytes, dst_bytes;
3344 {
3345 const unsigned char *src = source;
3346 unsigned char *dst = destination;
3347 const unsigned char *src_end = src + src_bytes;
3348 unsigned char *dst_end = dst + dst_bytes;
3349 Lisp_Object translation_table;
3350 /* SRC_BASE remembers the start position in source in each loop.
3351 The loop will be exited when there's not enough source code
3352 (within macro ONE_MORE_BYTE), or when there's not enough
3353 destination area to produce a character (within macro
3354 EMIT_CHAR). */
3355 const unsigned char *src_base;
3356 int c;
3357
3358 translation_table = Qnil;
3359 switch (coding->eol_type)
3360 {
3361 case CODING_EOL_CRLF:
3362 while (1)
3363 {
3364 src_base = src;
3365 ONE_MORE_BYTE (c);
3366 if (c == '\r')
3367 {
3368 ONE_MORE_BYTE (c);
3369 if (c != '\n')
3370 {
3371 src--;
3372 c = '\r';
3373 }
3374 }
3375 else if (c == '\n'
3376 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3377 {
3378 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3379 goto label_end_of_loop;
3380 }
3381 EMIT_CHAR (c);
3382 }
3383 break;
3384
3385 case CODING_EOL_CR:
3386 while (1)
3387 {
3388 src_base = src;
3389 ONE_MORE_BYTE (c);
3390 if (c == '\n')
3391 {
3392 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3393 {
3394 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3395 goto label_end_of_loop;
3396 }
3397 }
3398 else if (c == '\r')
3399 c = '\n';
3400 EMIT_CHAR (c);
3401 }
3402 break;
3403
3404 default: /* no need for EOL handling */
3405 while (1)
3406 {
3407 src_base = src;
3408 ONE_MORE_BYTE (c);
3409 EMIT_CHAR (c);
3410 }
3411 }
3412
3413 label_end_of_loop:
3414 coding->consumed = coding->consumed_char = src_base - source;
3415 coding->produced = dst - destination;
3416 return;
3417 }
3418
3419 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3420 format of end-of-line according to `coding->eol_type'. It also
3421 convert multibyte form 8-bit characters to unibyte if
3422 CODING->src_multibyte is nonzero. If `coding->mode &
3423 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3424 also means end-of-line. */
3425
3426 static void
3427 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3428 struct coding_system *coding;
3429 const unsigned char *source;
3430 unsigned char *destination;
3431 int src_bytes, dst_bytes;
3432 {
3433 const unsigned char *src = source;
3434 unsigned char *dst = destination;
3435 const unsigned char *src_end = src + src_bytes;
3436 unsigned char *dst_end = dst + dst_bytes;
3437 Lisp_Object translation_table;
3438 /* SRC_BASE remembers the start position in source in each loop.
3439 The loop will be exited when there's not enough source text to
3440 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3441 there's not enough destination area to produce encoded codes
3442 (within macro EMIT_BYTES). */
3443 const unsigned char *src_base;
3444 unsigned char *tmp;
3445 int c;
3446 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3447
3448 translation_table = Qnil;
3449 if (coding->src_multibyte
3450 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3451 {
3452 src_end--;
3453 src_bytes--;
3454 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3455 }
3456
3457 if (coding->eol_type == CODING_EOL_CRLF)
3458 {
3459 while (src < src_end)
3460 {
3461 src_base = src;
3462 c = *src++;
3463 if (c >= 0x20)
3464 EMIT_ONE_BYTE (c);
3465 else if (c == '\n' || (c == '\r' && selective_display))
3466 EMIT_TWO_BYTES ('\r', '\n');
3467 else
3468 EMIT_ONE_BYTE (c);
3469 }
3470 src_base = src;
3471 label_end_of_loop:
3472 ;
3473 }
3474 else
3475 {
3476 if (!dst_bytes || src_bytes <= dst_bytes)
3477 {
3478 safe_bcopy (src, dst, src_bytes);
3479 src_base = src_end;
3480 dst += src_bytes;
3481 }
3482 else
3483 {
3484 if (coding->src_multibyte
3485 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3486 dst_bytes--;
3487 safe_bcopy (src, dst, dst_bytes);
3488 src_base = src + dst_bytes;
3489 dst = destination + dst_bytes;
3490 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3491 }
3492 if (coding->eol_type == CODING_EOL_CR)
3493 {
3494 for (tmp = destination; tmp < dst; tmp++)
3495 if (*tmp == '\n') *tmp = '\r';
3496 }
3497 else if (selective_display)
3498 {
3499 for (tmp = destination; tmp < dst; tmp++)
3500 if (*tmp == '\r') *tmp = '\n';
3501 }
3502 }
3503 if (coding->src_multibyte)
3504 dst = destination + str_as_unibyte (destination, dst - destination);
3505
3506 coding->consumed = src_base - source;
3507 coding->produced = dst - destination;
3508 coding->produced_char = coding->produced;
3509 }
3510
3511 \f
3512 /*** 7. C library functions ***/
3513
3514 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3515 has a property `coding-system'. The value of this property is a
3516 vector of length 5 (called the coding-vector). Among elements of
3517 this vector, the first (element[0]) and the fifth (element[4])
3518 carry important information for decoding/encoding. Before
3519 decoding/encoding, this information should be set in fields of a
3520 structure of type `coding_system'.
3521
3522 The value of the property `coding-system' can be a symbol of another
3523 subsidiary coding-system. In that case, Emacs gets coding-vector
3524 from that symbol.
3525
3526 `element[0]' contains information to be set in `coding->type'. The
3527 value and its meaning is as follows:
3528
	0 -- coding_type_emacs_mule
	1 -- coding_type_sjis
	2 -- coding_type_iso2022
	3 -- coding_type_big5
	4 -- coding_type_ccl encoder/decoder written in CCL
	5 -- coding_type_raw_text
	nil -- coding_type_no_conversion
	t -- coding_type_undecided (automatic conversion on decoding,
	     no-conversion on encoding)
3537
3538 `element[4]' contains information to be set in `coding->flags' and
3539 `coding->spec'. The meaning varies by `coding->type'.
3540
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
   Meanings of these sub-elements are:
3544
3545 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3546 If the value is an integer of valid charset, the charset is
3547 assumed to be designated to graphic register N initially.
3548
3549 If the value is minus, it is a minus value of charset which
3550 reserves graphic register N, which means that the charset is
3551 not designated initially but should be designated to graphic
3552 register N just before encoding a character in that charset.
3553
3554 If the value is nil, graphic register N is never used on
3555 encoding.
3556
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
	Each value takes t or nil.  See the section ISO2022 of
	`coding.h' for more information.
3560
3561 If `coding->type' is `coding_type_big5', element[4] is t to denote
3562 BIG5-ETen or nil to denote BIG5-HKU.
3563
3564 If `coding->type' takes the other value, element[4] is ignored.
3565
3566 Emacs Lisp's coding systems also carry information about format of
3567 end-of-line in a value of property `eol-type'. If the value is
3568 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3569 means CODING_EOL_CR. If it is not integer, it should be a vector
3570 of subsidiary coding systems of which property `eol-type' has one
3571 of the above values.
3572
3573 */
3574
3575 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3576 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3577 is setup so that no conversion is necessary and return -1, else
3578 return 0. */
3579
3580 int
3581 setup_coding_system (coding_system, coding)
3582 Lisp_Object coding_system;
3583 struct coding_system *coding;
3584 {
3585 Lisp_Object coding_spec, coding_type, eol_type, plist;
3586 Lisp_Object val;
3587
3588 /* At first, zero clear all members. */
3589 bzero (coding, sizeof (struct coding_system));
3590
3591 /* Initialize some fields required for all kinds of coding systems. */
3592 coding->symbol = coding_system;
3593 coding->heading_ascii = -1;
3594 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3595 coding->composing = COMPOSITION_DISABLED;
3596 coding->cmp_data = NULL;
3597
3598 if (NILP (coding_system))
3599 goto label_invalid_coding_system;
3600
3601 coding_spec = Fget (coding_system, Qcoding_system);
3602
3603 if (!VECTORP (coding_spec)
3604 || XVECTOR (coding_spec)->size != 5
3605 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3606 goto label_invalid_coding_system;
3607
3608 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3609 if (VECTORP (eol_type))
3610 {
3611 coding->eol_type = CODING_EOL_UNDECIDED;
3612 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3613 if (system_eol_type != CODING_EOL_LF)
3614 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3615 }
3616 else if (XFASTINT (eol_type) == 1)
3617 {
3618 coding->eol_type = CODING_EOL_CRLF;
3619 coding->common_flags
3620 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3621 }
3622 else if (XFASTINT (eol_type) == 2)
3623 {
3624 coding->eol_type = CODING_EOL_CR;
3625 coding->common_flags
3626 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3627 }
3628 else
3629 {
3630 coding->common_flags = 0;
3631 coding->eol_type = CODING_EOL_LF;
3632 }
3633
3634 coding_type = XVECTOR (coding_spec)->contents[0];
3635 /* Try short cut. */
3636 if (SYMBOLP (coding_type))
3637 {
3638 if (EQ (coding_type, Qt))
3639 {
3640 coding->type = coding_type_undecided;
3641 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3642 }
3643 else
3644 coding->type = coding_type_no_conversion;
3645 /* Initialize this member. Any thing other than
3646 CODING_CATEGORY_IDX_UTF_16_BE and
3647 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3648 special treatment in detect_eol. */
3649 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3650
3651 return 0;
3652 }
3653
3654 /* Get values of coding system properties:
3655 `post-read-conversion', `pre-write-conversion',
3656 `translation-table-for-decode', `translation-table-for-encode'. */
3657 plist = XVECTOR (coding_spec)->contents[3];
3658 /* Pre & post conversion functions should be disabled if
3659 inhibit_eol_conversion is nonzero. This is the case that a code
3660 conversion function is called while those functions are running. */
3661 if (! inhibit_pre_post_conversion)
3662 {
3663 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3664 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3665 }
3666 val = Fplist_get (plist, Qtranslation_table_for_decode);
3667 if (SYMBOLP (val))
3668 val = Fget (val, Qtranslation_table_for_decode);
3669 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3670 val = Fplist_get (plist, Qtranslation_table_for_encode);
3671 if (SYMBOLP (val))
3672 val = Fget (val, Qtranslation_table_for_encode);
3673 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3674 val = Fplist_get (plist, Qcoding_category);
3675 if (!NILP (val))
3676 {
3677 val = Fget (val, Qcoding_category_index);
3678 if (INTEGERP (val))
3679 coding->category_idx = XINT (val);
3680 else
3681 goto label_invalid_coding_system;
3682 }
3683 else
3684 goto label_invalid_coding_system;
3685
3686 /* If the coding system has non-nil `composition' property, enable
3687 composition handling. */
3688 val = Fplist_get (plist, Qcomposition);
3689 if (!NILP (val))
3690 coding->composing = COMPOSITION_NO;
3691
3692 /* If the coding system is ascii-incompatible, record it in
3693 common_flags. */
3694 val = Fplist_get (plist, Qascii_incompatible);
3695 if (! NILP (val))
3696 coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3697
3698 switch (XFASTINT (coding_type))
3699 {
3700 case 0:
3701 coding->type = coding_type_emacs_mule;
3702 coding->common_flags
3703 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3704 if (!NILP (coding->post_read_conversion))
3705 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3706 if (!NILP (coding->pre_write_conversion))
3707 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3708 break;
3709
3710 case 1:
3711 coding->type = coding_type_sjis;
3712 coding->common_flags
3713 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3714 break;
3715
3716 case 2:
3717 coding->type = coding_type_iso2022;
3718 coding->common_flags
3719 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3720 {
3721 Lisp_Object val, temp;
3722 Lisp_Object *flags;
3723 int i, charset, reg_bits = 0;
3724
3725 val = XVECTOR (coding_spec)->contents[4];
3726
3727 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3728 goto label_invalid_coding_system;
3729
3730 flags = XVECTOR (val)->contents;
3731 coding->flags
3732 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3733 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3734 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3735 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3736 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3737 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3738 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3739 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3740 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3741 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3742 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3743 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3744 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3745 );
3746
3747 /* Invoke graphic register 0 to plane 0. */
3748 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3749 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3750 CODING_SPEC_ISO_INVOCATION (coding, 1)
3751 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3752 /* Not single shifting at first. */
3753 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3754 /* Beginning of buffer should also be regarded as bol. */
3755 CODING_SPEC_ISO_BOL (coding) = 1;
3756
3757 for (charset = 0; charset <= MAX_CHARSET; charset++)
3758 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3759 val = Vcharset_revision_alist;
3760 while (CONSP (val))
3761 {
3762 charset = get_charset_id (Fcar_safe (XCAR (val)));
3763 if (charset >= 0
3764 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3765 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3766 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3767 val = XCDR (val);
3768 }
3769
3770 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3771 FLAGS[REG] can be one of below:
3772 integer CHARSET: CHARSET occupies register I,
3773 t: designate nothing to REG initially, but can be used
3774 by any charsets,
3775 list of integer, nil, or t: designate the first
3776 element (if integer) to REG initially, the remaining
3777 elements (if integer) is designated to REG on request,
3778 if an element is t, REG can be used by any charsets,
3779 nil: REG is never used. */
3780 for (charset = 0; charset <= MAX_CHARSET; charset++)
3781 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3782 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3783 for (i = 0; i < 4; i++)
3784 {
3785 if ((INTEGERP (flags[i])
3786 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3787 || (charset = get_charset_id (flags[i])) >= 0)
3788 {
3789 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3790 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3791 }
3792 else if (EQ (flags[i], Qt))
3793 {
3794 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3795 reg_bits |= 1 << i;
3796 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3797 }
3798 else if (CONSP (flags[i]))
3799 {
3800 Lisp_Object tail;
3801 tail = flags[i];
3802
3803 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3804 if ((INTEGERP (XCAR (tail))
3805 && (charset = XINT (XCAR (tail)),
3806 CHARSET_VALID_P (charset)))
3807 || (charset = get_charset_id (XCAR (tail))) >= 0)
3808 {
3809 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3810 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3811 }
3812 else
3813 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3814 tail = XCDR (tail);
3815 while (CONSP (tail))
3816 {
3817 if ((INTEGERP (XCAR (tail))
3818 && (charset = XINT (XCAR (tail)),
3819 CHARSET_VALID_P (charset)))
3820 || (charset = get_charset_id (XCAR (tail))) >= 0)
3821 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3822 = i;
3823 else if (EQ (XCAR (tail), Qt))
3824 reg_bits |= 1 << i;
3825 tail = XCDR (tail);
3826 }
3827 }
3828 else
3829 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3830
3831 CODING_SPEC_ISO_DESIGNATION (coding, i)
3832 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3833 }
3834
3835 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3836 {
3837 /* REG 1 can be used only by locking shift in 7-bit env. */
3838 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3839 reg_bits &= ~2;
3840 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3841 /* Without any shifting, only REG 0 and 1 can be used. */
3842 reg_bits &= 3;
3843 }
3844
3845 if (reg_bits)
3846 for (charset = 0; charset <= MAX_CHARSET; charset++)
3847 {
3848 if (CHARSET_DEFINED_P (charset)
3849 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3850 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3851 {
3852 /* There exist some default graphic registers to be
3853 used by CHARSET. */
3854
3855 /* We had better avoid designating a charset of
3856 CHARS96 to REG 0 as far as possible. */
3857 if (CHARSET_CHARS (charset) == 96)
3858 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3859 = (reg_bits & 2
3860 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3861 else
3862 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3863 = (reg_bits & 1
3864 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3865 }
3866 }
3867 }
3868 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3869 coding->spec.iso2022.last_invalid_designation_register = -1;
3870 break;
3871
3872 case 3:
3873 coding->type = coding_type_big5;
3874 coding->common_flags
3875 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3876 coding->flags
3877 = (NILP (XVECTOR (coding_spec)->contents[4])
3878 ? CODING_FLAG_BIG5_HKU
3879 : CODING_FLAG_BIG5_ETEN);
3880 break;
3881
3882 case 4:
3883 coding->type = coding_type_ccl;
3884 coding->common_flags
3885 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3886 {
3887 val = XVECTOR (coding_spec)->contents[4];
3888 if (! CONSP (val)
3889 || setup_ccl_program (&(coding->spec.ccl.decoder),
3890 XCAR (val)) < 0
3891 || setup_ccl_program (&(coding->spec.ccl.encoder),
3892 XCDR (val)) < 0)
3893 goto label_invalid_coding_system;
3894
3895 bzero (coding->spec.ccl.valid_codes, 256);
3896 val = Fplist_get (plist, Qvalid_codes);
3897 if (CONSP (val))
3898 {
3899 Lisp_Object this;
3900
3901 for (; CONSP (val); val = XCDR (val))
3902 {
3903 this = XCAR (val);
3904 if (INTEGERP (this)
3905 && XINT (this) >= 0 && XINT (this) < 256)
3906 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3907 else if (CONSP (this)
3908 && INTEGERP (XCAR (this))
3909 && INTEGERP (XCDR (this)))
3910 {
3911 int start = XINT (XCAR (this));
3912 int end = XINT (XCDR (this));
3913
3914 if (start >= 0 && start <= end && end < 256)
3915 while (start <= end)
3916 coding->spec.ccl.valid_codes[start++] = 1;
3917 }
3918 }
3919 }
3920 }
3921 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3922 coding->spec.ccl.cr_carryover = 0;
3923 coding->spec.ccl.eight_bit_carryover[0] = 0;
3924 break;
3925
3926 case 5:
3927 coding->type = coding_type_raw_text;
3928 break;
3929
3930 default:
3931 goto label_invalid_coding_system;
3932 }
3933 return 0;
3934
3935 label_invalid_coding_system:
3936 coding->type = coding_type_no_conversion;
3937 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3938 coding->common_flags = 0;
3939 coding->eol_type = CODING_EOL_UNDECIDED;
3940 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3941 return NILP (coding_system) ? 0 : -1;
3942 }
3943
3944 /* Free memory blocks allocated for storing composition information. */
3945
3946 void
3947 coding_free_composition_data (coding)
3948 struct coding_system *coding;
3949 {
3950 struct composition_data *cmp_data = coding->cmp_data, *next;
3951
3952 if (!cmp_data)
3953 return;
3954 /* Memory blocks are chained. At first, rewind to the first, then,
3955 free blocks one by one. */
3956 while (cmp_data->prev)
3957 cmp_data = cmp_data->prev;
3958 while (cmp_data)
3959 {
3960 next = cmp_data->next;
3961 xfree (cmp_data);
3962 cmp_data = next;
3963 }
3964 coding->cmp_data = NULL;
3965 }
3966
3967 /* Set `char_offset' member of all memory blocks pointed by
3968 coding->cmp_data to POS. */
3969
3970 void
3971 coding_adjust_composition_offset (coding, pos)
3972 struct coding_system *coding;
3973 int pos;
3974 {
3975 struct composition_data *cmp_data;
3976
3977 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3978 cmp_data->char_offset = pos;
3979 }
3980
3981 /* Setup raw-text or one of its subsidiaries in the structure
3982 coding_system CODING according to the already setup value eol_type
3983 in CODING. CODING should be setup for some coding system in
3984 advance. */
3985
3986 void
3987 setup_raw_text_coding_system (coding)
3988 struct coding_system *coding;
3989 {
3990 if (coding->type != coding_type_raw_text)
3991 {
3992 coding->symbol = Qraw_text;
3993 coding->type = coding_type_raw_text;
3994 if (coding->eol_type != CODING_EOL_UNDECIDED)
3995 {
3996 Lisp_Object subsidiaries;
3997 subsidiaries = Fget (Qraw_text, Qeol_type);
3998
3999 if (VECTORP (subsidiaries)
4000 && XVECTOR (subsidiaries)->size == 3)
4001 coding->symbol
4002 = XVECTOR (subsidiaries)->contents[coding->eol_type];
4003 }
4004 setup_coding_system (coding->symbol, coding);
4005 }
4006 return;
4007 }
4008
4009 /* Emacs has a mechanism to automatically detect a coding system if it
4010 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4011 it's impossible to distinguish some coding systems accurately
4012 because they use the same range of codes. So, at first, coding
4013 systems are first grouped into the following categories:
4014
4015 o coding-category-emacs-mule
4016
4017 The category for a coding system which has the same code range
4018 as Emacs' internal format. Assigned the coding-system (Lisp
4019 symbol) `emacs-mule' by default.
4020
4021 o coding-category-sjis
4022
4023 The category for a coding system which has the same code range
4024 as SJIS. Assigned the coding-system (Lisp
4025 symbol) `japanese-shift-jis' by default.
4026
4027 o coding-category-iso-7
4028
4029 The category for a coding system which has the same code range
4030 as ISO2022 of 7-bit environment. This doesn't use any locking
4031 shift and single shift functions. This can encode/decode all
4032 charsets. Assigned the coding-system (Lisp symbol)
4033 `iso-2022-7bit' by default.
4034
4035 o coding-category-iso-7-tight
4036
4037 Same as coding-category-iso-7 except that this can
4038 encode/decode only the specified charsets.
4039
4040 o coding-category-iso-8-1
4041
4042 The category for a coding system which has the same code range
4043 as ISO2022 of 8-bit environment and graphic plane 1 used only
4044 for DIMENSION1 charset. This doesn't use any locking shift
4045 and single shift functions. Assigned the coding-system (Lisp
4046 symbol) `iso-latin-1' by default.
4047
4048 o coding-category-iso-8-2
4049
4050 The category for a coding system which has the same code range
4051 as ISO2022 of 8-bit environment and graphic plane 1 used only
4052 for DIMENSION2 charset. This doesn't use any locking shift
4053 and single shift functions. Assigned the coding-system (Lisp
4054 symbol) `japanese-iso-8bit' by default.
4055
4056 o coding-category-iso-7-else
4057
4058 The category for a coding system which has the same code range
4059 as ISO2022 of 7-bit environment but uses locking shift or
4060 single shift functions. Assigned the coding-system (Lisp
4061 symbol) `iso-2022-7bit-lock' by default.
4062
4063 o coding-category-iso-8-else
4064
4065 The category for a coding system which has the same code range
4066 as ISO2022 of 8-bit environment but uses locking shift or
4067 single shift functions. Assigned the coding-system (Lisp
4068 symbol) `iso-2022-8bit-ss2' by default.
4069
4070 o coding-category-big5
4071
4072 The category for a coding system which has the same code range
4073 as BIG5. Assigned the coding-system (Lisp symbol)
4074 `cn-big5' by default.
4075
4076 o coding-category-utf-8
4077
4078 The category for a coding system which has the same code range
4079 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4080 symbol) `utf-8' by default.
4081
4082 o coding-category-utf-16-be
4083
4084 The category for a coding system in which a text has a
4085 Unicode signature (cf. Unicode Standard) in the order of BIG
4086 endian at the head. Assigned the coding-system (Lisp symbol)
4087 `utf-16-be' by default.
4088
4089 o coding-category-utf-16-le
4090
4091 The category for a coding system in which a text has a
4092 Unicode signature (cf. Unicode Standard) in the order of
4093 LITTLE endian at the head. Assigned the coding-system (Lisp
4094 symbol) `utf-16-le' by default.
4095
4096 o coding-category-ccl
4097
4098 The category for a coding system of which encoder/decoder is
4099 written in CCL programs. The default value is nil, i.e., no
4100 coding system is assigned.
4101
4102 o coding-category-binary
4103
4104 The category for a coding system not categorized in any of the
4105 above. Assigned the coding-system (Lisp symbol)
4106 `no-conversion' by default.
4107
4108 Each of them is a Lisp symbol and the value is an actual
4109 `coding-system' (this is also a Lisp symbol) assigned by a user.
4110 What Emacs does actually is to detect a category of coding system.
4111 Then, it uses a `coding-system' assigned to it. If Emacs can't
4112 decide a single possible category, it selects a category of the
4113 highest priority. Priorities of categories are also specified by a
4114 user in a Lisp variable `coding-category-list'.
4115
4116 */
4117
/* Table indexed by byte value.  detect_coding_mask skips over a byte
   at the head of a text while its entry here is nonzero.  */
static int ascii_skip_code[256];
4120
4121 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4122 If it detects possible coding systems, return an integer in which
4123 appropriate flag bits are set. Flag bits are defined by macros
4124 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4125 it should point the table `coding_priorities'. In that case, only
4126 the flag bit for a coding system of the highest priority is set in
4127 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4128 range 0x80..0x9F are in multibyte form.
4129
4130 How many ASCII characters are at the head is returned as *SKIP. */
4131
4132 static int
4133 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4134 unsigned char *source;
4135 int src_bytes, *priorities, *skip;
4136 int multibytep;
4137 {
4138 register unsigned char c;
4139 unsigned char *src = source, *src_end = source + src_bytes;
4140 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4141 int i;
4142
4143 /* At first, skip all ASCII characters and control characters except
4144 for three ISO2022 specific control characters. */
4145 ascii_skip_code[ISO_CODE_SO] = 0;
4146 ascii_skip_code[ISO_CODE_SI] = 0;
4147 ascii_skip_code[ISO_CODE_ESC] = 0;
4148
4149 label_loop_detect_coding:
4150 while (src < src_end && ascii_skip_code[*src]) src++;
4151 *skip = src - source;
4152
4153 if (src >= src_end)
4154 /* We found nothing other than ASCII. There's nothing to do. */
4155 return 0;
4156
4157 c = *src;
4158 /* The text seems to be encoded in some multilingual coding system.
4159 Now, try to find in which coding system the text is encoded. */
4160 if (c < 0x80)
4161 {
4162 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4163 /* C is an ISO2022 specific control code of C0. */
4164 mask = detect_coding_iso2022 (src, src_end, multibytep);
4165 if (mask == 0)
4166 {
4167 /* No valid ISO2022 code follows C. Try again. */
4168 src++;
4169 if (c == ISO_CODE_ESC)
4170 ascii_skip_code[ISO_CODE_ESC] = 1;
4171 else
4172 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4173 goto label_loop_detect_coding;
4174 }
4175 if (priorities)
4176 {
4177 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4178 {
4179 if (mask & priorities[i])
4180 return priorities[i];
4181 }
4182 return CODING_CATEGORY_MASK_RAW_TEXT;
4183 }
4184 }
4185 else
4186 {
4187 int try;
4188
4189 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4190 c = src[1] - 0x20;
4191
4192 if (c < 0xA0)
4193 {
4194 /* C is the first byte of SJIS character code,
4195 or a leading-code of Emacs' internal format (emacs-mule),
4196 or the first byte of UTF-16. */
4197 try = (CODING_CATEGORY_MASK_SJIS
4198 | CODING_CATEGORY_MASK_EMACS_MULE
4199 | CODING_CATEGORY_MASK_UTF_16_BE
4200 | CODING_CATEGORY_MASK_UTF_16_LE);
4201
4202 /* Or, if C is a special latin extra code,
4203 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4204 or is an ISO2022 control-sequence-introducer (CSI),
4205 we should also consider the possibility of ISO2022 codings. */
4206 if ((VECTORP (Vlatin_extra_code_table)
4207 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4208 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4209 || (c == ISO_CODE_CSI
4210 && (src < src_end
4211 && (*src == ']'
4212 || ((*src == '0' || *src == '1' || *src == '2')
4213 && src + 1 < src_end
4214 && src[1] == ']')))))
4215 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4216 | CODING_CATEGORY_MASK_ISO_8BIT);
4217 }
4218 else
4219 /* C is a character of ISO2022 in graphic plane right,
4220 or a SJIS's 1-byte character code (i.e. JISX0201),
4221 or the first byte of BIG5's 2-byte code,
4222 or the first byte of UTF-8/16. */
4223 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4224 | CODING_CATEGORY_MASK_ISO_8BIT
4225 | CODING_CATEGORY_MASK_SJIS
4226 | CODING_CATEGORY_MASK_BIG5
4227 | CODING_CATEGORY_MASK_UTF_8
4228 | CODING_CATEGORY_MASK_UTF_16_BE
4229 | CODING_CATEGORY_MASK_UTF_16_LE);
4230
4231 /* Or, we may have to consider the possibility of CCL. */
4232 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4233 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4234 ->spec.ccl.valid_codes)[c])
4235 try |= CODING_CATEGORY_MASK_CCL;
4236
4237 mask = 0;
4238 utf16_examined_p = iso2022_examined_p = 0;
4239 if (priorities)
4240 {
4241 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4242 {
4243 if (!iso2022_examined_p
4244 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4245 {
4246 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4247 iso2022_examined_p = 1;
4248 }
4249 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4250 mask |= detect_coding_sjis (src, src_end, multibytep);
4251 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4252 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4253 else if (!utf16_examined_p
4254 && (priorities[i] & try &
4255 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4256 {
4257 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4258 utf16_examined_p = 1;
4259 }
4260 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4261 mask |= detect_coding_big5 (src, src_end, multibytep);
4262 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4263 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4264 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4265 mask |= detect_coding_ccl (src, src_end, multibytep);
4266 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4267 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4268 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4269 mask |= CODING_CATEGORY_MASK_BINARY;
4270 if (mask & priorities[i])
4271 return priorities[i];
4272 }
4273 return CODING_CATEGORY_MASK_RAW_TEXT;
4274 }
4275 if (try & CODING_CATEGORY_MASK_ISO)
4276 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4277 if (try & CODING_CATEGORY_MASK_SJIS)
4278 mask |= detect_coding_sjis (src, src_end, multibytep);
4279 if (try & CODING_CATEGORY_MASK_BIG5)
4280 mask |= detect_coding_big5 (src, src_end, multibytep);
4281 if (try & CODING_CATEGORY_MASK_UTF_8)
4282 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4283 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4284 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4285 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4286 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4287 if (try & CODING_CATEGORY_MASK_CCL)
4288 mask |= detect_coding_ccl (src, src_end, multibytep);
4289 }
4290 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4291 }
4292
4293 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4294 The information of the detected coding system is set in CODING. */
4295
4296 void
4297 detect_coding (coding, src, src_bytes)
4298 struct coding_system *coding;
4299 const unsigned char *src;
4300 int src_bytes;
4301 {
4302 unsigned int idx;
4303 int skip, mask;
4304 Lisp_Object val;
4305
4306 val = Vcoding_category_list;
4307 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4308 coding->src_multibyte);
4309 coding->heading_ascii = skip;
4310
4311 if (!mask) return;
4312
4313 /* We found a single coding system of the highest priority in MASK. */
4314 idx = 0;
4315 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4316 if (! mask)
4317 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4318
4319 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4320
4321 if (coding->eol_type != CODING_EOL_UNDECIDED)
4322 {
4323 Lisp_Object tmp;
4324
4325 tmp = Fget (val, Qeol_type);
4326 if (VECTORP (tmp))
4327 val = XVECTOR (tmp)->contents[coding->eol_type];
4328 }
4329
4330 /* Setup this new coding system while preserving some slots. */
4331 {
4332 int src_multibyte = coding->src_multibyte;
4333 int dst_multibyte = coding->dst_multibyte;
4334
4335 setup_coding_system (val, coding);
4336 coding->src_multibyte = src_multibyte;
4337 coding->dst_multibyte = dst_multibyte;
4338 coding->heading_ascii = skip;
4339 }
4340 }
4341
4342 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4343 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4344 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4345
4346 How many non-eol characters are at the head is returned as *SKIP. */
4347
4348 #define MAX_EOL_CHECK_COUNT 3
4349
4350 static int
4351 detect_eol_type (source, src_bytes, skip)
4352 unsigned char *source;
4353 int src_bytes, *skip;
4354 {
4355 unsigned char *src = source, *src_end = src + src_bytes;
4356 unsigned char c;
4357 int total = 0; /* How many end-of-lines are found so far. */
4358 int eol_type = CODING_EOL_UNDECIDED;
4359 int this_eol_type;
4360
4361 *skip = 0;
4362
4363 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4364 {
4365 c = *src++;
4366 if (c == '\n' || c == '\r')
4367 {
4368 if (*skip == 0)
4369 *skip = src - 1 - source;
4370 total++;
4371 if (c == '\n')
4372 this_eol_type = CODING_EOL_LF;
4373 else if (src >= src_end || *src != '\n')
4374 this_eol_type = CODING_EOL_CR;
4375 else
4376 this_eol_type = CODING_EOL_CRLF, src++;
4377
4378 if (eol_type == CODING_EOL_UNDECIDED)
4379 /* This is the first end-of-line. */
4380 eol_type = this_eol_type;
4381 else if (eol_type != this_eol_type)
4382 {
4383 /* The found type is different from what found before. */
4384 eol_type = CODING_EOL_INCONSISTENT;
4385 break;
4386 }
4387 }
4388 }
4389
4390 if (*skip == 0)
4391 *skip = src_end - source;
4392 return eol_type;
4393 }
4394
4395 /* Like detect_eol_type, but detect EOL type in 2-octet
4396 big-endian/little-endian format for coding systems utf-16-be and
4397 utf-16-le. */
4398
4399 static int
4400 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4401 unsigned char *source;
4402 int src_bytes, *skip, big_endian_p;
4403 {
4404 unsigned char *src = source, *src_end = src + src_bytes;
4405 unsigned int c1, c2;
4406 int total = 0; /* How many end-of-lines are found so far. */
4407 int eol_type = CODING_EOL_UNDECIDED;
4408 int this_eol_type;
4409 int msb, lsb;
4410
4411 if (big_endian_p)
4412 msb = 0, lsb = 1;
4413 else
4414 msb = 1, lsb = 0;
4415
4416 *skip = 0;
4417
4418 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4419 {
4420 c1 = (src[msb] << 8) | (src[lsb]);
4421 src += 2;
4422
4423 if (c1 == '\n' || c1 == '\r')
4424 {
4425 if (*skip == 0)
4426 *skip = src - 2 - source;
4427 total++;
4428 if (c1 == '\n')
4429 {
4430 this_eol_type = CODING_EOL_LF;
4431 }
4432 else
4433 {
4434 if ((src + 1) >= src_end)
4435 {
4436 this_eol_type = CODING_EOL_CR;
4437 }
4438 else
4439 {
4440 c2 = (src[msb] << 8) | (src[lsb]);
4441 if (c2 == '\n')
4442 this_eol_type = CODING_EOL_CRLF, src += 2;
4443 else
4444 this_eol_type = CODING_EOL_CR;
4445 }
4446 }
4447
4448 if (eol_type == CODING_EOL_UNDECIDED)
4449 /* This is the first end-of-line. */
4450 eol_type = this_eol_type;
4451 else if (eol_type != this_eol_type)
4452 {
4453 /* The found type is different from what found before. */
4454 eol_type = CODING_EOL_INCONSISTENT;
4455 break;
4456 }
4457 }
4458 }
4459
4460 if (*skip == 0)
4461 *skip = src_end - source;
4462 return eol_type;
4463 }
4464
4465 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4466 is encoded. If it detects an appropriate format of end-of-line, it
4467 sets the information in *CODING. */
4468
4469 void
4470 detect_eol (coding, src, src_bytes)
4471 struct coding_system *coding;
4472 const unsigned char *src;
4473 int src_bytes;
4474 {
4475 Lisp_Object val;
4476 int skip;
4477 int eol_type;
4478
4479 switch (coding->category_idx)
4480 {
4481 case CODING_CATEGORY_IDX_UTF_16_BE:
4482 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4483 break;
4484 case CODING_CATEGORY_IDX_UTF_16_LE:
4485 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4486 break;
4487 default:
4488 eol_type = detect_eol_type (src, src_bytes, &skip);
4489 break;
4490 }
4491
4492 if (coding->heading_ascii > skip)
4493 coding->heading_ascii = skip;
4494 else
4495 skip = coding->heading_ascii;
4496
4497 if (eol_type == CODING_EOL_UNDECIDED)
4498 return;
4499 if (eol_type == CODING_EOL_INCONSISTENT)
4500 {
4501 #if 0
4502 /* This code is suppressed until we find a better way to
4503 distinguish raw text file and binary file. */
4504
4505 /* If we have already detected that the coding is raw-text, the
4506 coding should actually be no-conversion. */
4507 if (coding->type == coding_type_raw_text)
4508 {
4509 setup_coding_system (Qno_conversion, coding);
4510 return;
4511 }
4512 /* Else, let's decode only text code anyway. */
4513 #endif /* 0 */
4514 eol_type = CODING_EOL_LF;
4515 }
4516
4517 val = Fget (coding->symbol, Qeol_type);
4518 if (VECTORP (val) && XVECTOR (val)->size == 3)
4519 {
4520 int src_multibyte = coding->src_multibyte;
4521 int dst_multibyte = coding->dst_multibyte;
4522 struct composition_data *cmp_data = coding->cmp_data;
4523
4524 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4525 coding->src_multibyte = src_multibyte;
4526 coding->dst_multibyte = dst_multibyte;
4527 coding->heading_ascii = skip;
4528 coding->cmp_data = cmp_data;
4529 }
4530 }
4531
/* Number of extra bytes added to every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding may enlarge a text for CODING (a pointer
   to struct coding_system): 3 for ISO2022, the decoder program's
   buf_magnification for CCL, and 2 otherwise.  CODING is
   parenthesized so that any pointer-valued expression (e.g. `&c') can
   be passed; note that it may be evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)				\
  ((coding)->type == coding_type_iso2022			\
   ? 3								\
   : ((coding)->type == coding_type_ccl				\
      ? (coding)->spec.ccl.decoder.buf_magnification		\
      : 2))
4540
4541 /* Return maximum size (bytes) of a buffer enough for decoding
4542 SRC_BYTES of text encoded in CODING. */
4543
4544 int
4545 decoding_buffer_size (coding, src_bytes)
4546 struct coding_system *coding;
4547 int src_bytes;
4548 {
4549 return (src_bytes * DECODING_BUFFER_MAG (coding)
4550 + CONVERSION_BUFFER_EXTRA_ROOM);
4551 }
4552
4553 /* Return maximum size (bytes) of a buffer enough for encoding
4554 SRC_BYTES of text to CODING. */
4555
4556 int
4557 encoding_buffer_size (coding, src_bytes)
4558 struct coding_system *coding;
4559 int src_bytes;
4560 {
4561 int magnification;
4562
4563 if (coding->type == coding_type_ccl)
4564 {
4565 magnification = coding->spec.ccl.encoder.buf_magnification;
4566 if (coding->eol_type == CODING_EOL_CRLF)
4567 magnification *= 2;
4568 }
4569 else if (CODING_REQUIRE_ENCODING (coding))
4570 magnification = 3;
4571 else
4572 magnification = 1;
4573
4574 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4575 }
4576
/* Scratch buffer used while converting text.  */
struct conversion_buffer
{
  int size;			/* Number of bytes allocated for DATA.  */
  int on_stack;			/* 1 if DATA was allocated by alloca.  */
  unsigned char *data;		/* The allocated memory.  */
};
4584
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer):
   with alloca if LEN is less than MAX_ALLOCA, with xmalloc otherwise.
   BUF.on_stack records which, and BUF.size is set to LEN.

   BUF and LEN are parenthesized in the expansion so that arbitrary
   expressions (e.g. `*p', or `a ? b : c') expand as intended.  Both
   are still evaluated more than once, so they must be free of side
   effects.  */
#define allocate_conversion_buffer(buf, len)		\
  do {							\
    if ((len) < MAX_ALLOCA)				\
      {							\
	(buf).data = (unsigned char *) alloca (len);	\
	(buf).on_stack = 1;				\
      }							\
    else						\
      {							\
	(buf).data = (unsigned char *) xmalloc (len);	\
	(buf).on_stack = 0;				\
      }							\
    (buf).size = (len);					\
  } while (0)
4600
4601 /* Double the allocated memory for *BUF. */
4602 static void
4603 extend_conversion_buffer (buf)
4604 struct conversion_buffer *buf;
4605 {
4606 if (buf->on_stack)
4607 {
4608 unsigned char *save = buf->data;
4609 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4610 bcopy (save, buf->data, buf->size);
4611 buf->on_stack = 0;
4612 }
4613 else
4614 {
4615 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4616 }
4617 buf->size *= 2;
4618 }
4619
4620 /* Free the allocated memory for BUF if it is not on stack. */
4621 static void
4622 free_conversion_buffer (buf)
4623 struct conversion_buffer *buf;
4624 {
4625 if (!buf->on_stack)
4626 xfree (buf->data);
4627 }
4628
4629 int
4630 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4631 struct coding_system *coding;
4632 unsigned char *source, *destination;
4633 int src_bytes, dst_bytes, encodep;
4634 {
4635 struct ccl_program *ccl
4636 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4637 unsigned char *dst = destination;
4638
4639 ccl->suppress_error = coding->suppress_error;
4640 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4641 if (encodep)
4642 {
4643 /* On encoding, EOL format is converted within ccl_driver. For
4644 that, setup proper information in the structure CCL. */
4645 ccl->eol_type = coding->eol_type;
4646 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4647 ccl->eol_type = CODING_EOL_LF;
4648 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4649 ccl->eight_bit_control = coding->dst_multibyte;
4650 }
4651 else
4652 ccl->eight_bit_control = 1;
4653 ccl->multibyte = coding->src_multibyte;
4654 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4655 {
4656 /* Move carryover bytes to DESTINATION. */
4657 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4658 while (*p)
4659 *dst++ = *p++;
4660 coding->spec.ccl.eight_bit_carryover[0] = 0;
4661 if (dst_bytes)
4662 dst_bytes -= dst - destination;
4663 }
4664
4665 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4666 &(coding->consumed))
4667 + dst - destination);
4668
4669 if (encodep)
4670 {
4671 coding->produced_char = coding->produced;
4672 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4673 }
4674 else if (!ccl->eight_bit_control)
4675 {
4676 /* The produced bytes forms a valid multibyte sequence. */
4677 coding->produced_char
4678 = multibyte_chars_in_text (destination, coding->produced);
4679 coding->spec.ccl.eight_bit_carryover[0] = 0;
4680 }
4681 else
4682 {
4683 /* On decoding, the destination should always multibyte. But,
4684 CCL program might have been generated an invalid multibyte
4685 sequence. Here we make such a sequence valid as
4686 multibyte. */
4687 int bytes
4688 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4689
4690 if ((coding->consumed < src_bytes
4691 || !ccl->last_block)
4692 && coding->produced >= 1
4693 && destination[coding->produced - 1] >= 0x80)
4694 {
4695 /* We should not convert the tailing 8-bit codes to
4696 multibyte form even if they doesn't form a valid
4697 multibyte sequence. They may form a valid sequence in
4698 the next call. */
4699 int carryover = 0;
4700
4701 if (destination[coding->produced - 1] < 0xA0)
4702 carryover = 1;
4703 else if (coding->produced >= 2)
4704 {
4705 if (destination[coding->produced - 2] >= 0x80)
4706 {
4707 if (destination[coding->produced - 2] < 0xA0)
4708 carryover = 2;
4709 else if (coding->produced >= 3
4710 && destination[coding->produced - 3] >= 0x80
4711 && destination[coding->produced - 3] < 0xA0)
4712 carryover = 3;
4713 }
4714 }
4715 if (carryover > 0)
4716 {
4717 BCOPY_SHORT (destination + coding->produced - carryover,
4718 coding->spec.ccl.eight_bit_carryover,
4719 carryover);
4720 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4721 coding->produced -= carryover;
4722 }
4723 }
4724 coding->produced = str_as_multibyte (destination, bytes,
4725 coding->produced,
4726 &(coding->produced_char));
4727 }
4728
4729 switch (ccl->status)
4730 {
4731 case CCL_STAT_SUSPEND_BY_SRC:
4732 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4733 break;
4734 case CCL_STAT_SUSPEND_BY_DST:
4735 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4736 break;
4737 case CCL_STAT_QUIT:
4738 case CCL_STAT_INVALID_CMD:
4739 coding->result = CODING_FINISH_INTERRUPT;
4740 break;
4741 default:
4742 coding->result = CODING_FINISH_NORMAL;
4743 break;
4744 }
4745 return coding->result;
4746 }
4747
4748 /* Decode EOL format of the text at PTR of BYTES length destructively
4749 according to CODING->eol_type. This is called after the CCL
4750 program produced a decoded text at PTR. If we do CRLF->LF
4751 conversion, update CODING->produced and CODING->produced_char. */
4752
4753 static void
4754 decode_eol_post_ccl (coding, ptr, bytes)
4755 struct coding_system *coding;
4756 unsigned char *ptr;
4757 int bytes;
4758 {
4759 Lisp_Object val, saved_coding_symbol;
4760 unsigned char *pend = ptr + bytes;
4761 int dummy;
4762
4763 /* Remember the current coding system symbol. We set it back when
4764 an inconsistent EOL is found so that `last-coding-system-used' is
4765 set to the coding system that doesn't specify EOL conversion. */
4766 saved_coding_symbol = coding->symbol;
4767
4768 coding->spec.ccl.cr_carryover = 0;
4769 if (coding->eol_type == CODING_EOL_UNDECIDED)
4770 {
4771 /* Here, to avoid the call of setup_coding_system, we directly
4772 call detect_eol_type. */
4773 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4774 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4775 coding->eol_type = CODING_EOL_LF;
4776 if (coding->eol_type != CODING_EOL_UNDECIDED)
4777 {
4778 val = Fget (coding->symbol, Qeol_type);
4779 if (VECTORP (val) && XVECTOR (val)->size == 3)
4780 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4781 }
4782 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4783 }
4784
4785 if (coding->eol_type == CODING_EOL_LF
4786 || coding->eol_type == CODING_EOL_UNDECIDED)
4787 {
4788 /* We have nothing to do. */
4789 ptr = pend;
4790 }
4791 else if (coding->eol_type == CODING_EOL_CRLF)
4792 {
4793 unsigned char *pstart = ptr, *p = ptr;
4794
4795 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4796 && *(pend - 1) == '\r')
4797 {
4798 /* If the last character is CR, we can't handle it here
4799 because LF will be in the not-yet-decoded source text.
4800 Record that the CR is not yet processed. */
4801 coding->spec.ccl.cr_carryover = 1;
4802 coding->produced--;
4803 coding->produced_char--;
4804 pend--;
4805 }
4806 while (ptr < pend)
4807 {
4808 if (*ptr == '\r')
4809 {
4810 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4811 {
4812 *p++ = '\n';
4813 ptr += 2;
4814 }
4815 else
4816 {
4817 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4818 goto undo_eol_conversion;
4819 *p++ = *ptr++;
4820 }
4821 }
4822 else if (*ptr == '\n'
4823 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4824 goto undo_eol_conversion;
4825 else
4826 *p++ = *ptr++;
4827 continue;
4828
4829 undo_eol_conversion:
4830 /* We have faced with inconsistent EOL format at PTR.
4831 Convert all LFs before PTR back to CRLFs. */
4832 for (p--, ptr--; p >= pstart; p--)
4833 {
4834 if (*p == '\n')
4835 *ptr-- = '\n', *ptr-- = '\r';
4836 else
4837 *ptr-- = *p;
4838 }
4839 /* If carryover is recorded, cancel it because we don't
4840 convert CRLF anymore. */
4841 if (coding->spec.ccl.cr_carryover)
4842 {
4843 coding->spec.ccl.cr_carryover = 0;
4844 coding->produced++;
4845 coding->produced_char++;
4846 pend++;
4847 }
4848 p = ptr = pend;
4849 coding->eol_type = CODING_EOL_LF;
4850 coding->symbol = saved_coding_symbol;
4851 }
4852 if (p < pend)
4853 {
4854 /* As each two-byte sequence CRLF was converted to LF, (PEND
4855 - P) is the number of deleted characters. */
4856 coding->produced -= pend - p;
4857 coding->produced_char -= pend - p;
4858 }
4859 }
4860 else /* i.e. coding->eol_type == CODING_EOL_CR */
4861 {
4862 unsigned char *p = ptr;
4863
4864 for (; ptr < pend; ptr++)
4865 {
4866 if (*ptr == '\r')
4867 *ptr = '\n';
4868 else if (*ptr == '\n'
4869 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4870 {
4871 for (; p < ptr; p++)
4872 {
4873 if (*p == '\n')
4874 *p = '\r';
4875 }
4876 ptr = pend;
4877 coding->eol_type = CODING_EOL_LF;
4878 coding->symbol = saved_coding_symbol;
4879 }
4880 }
4881 }
4882 }
4883
4884 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4885 decoding, it may detect coding system and format of end-of-line if
4886 those are not yet decided. The source should be unibyte, the
4887 result is multibyte if CODING->dst_multibyte is nonzero, else
4888 unibyte. */
4889
4890 int
4891 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4892 struct coding_system *coding;
4893 const unsigned char *source;
4894 unsigned char *destination;
4895 int src_bytes, dst_bytes;
4896 {
4897 int extra = 0;
4898
4899 if (coding->type == coding_type_undecided)
4900 detect_coding (coding, source, src_bytes);
4901
4902 if (coding->eol_type == CODING_EOL_UNDECIDED
4903 && coding->type != coding_type_ccl)
4904 {
4905 detect_eol (coding, source, src_bytes);
4906 /* We had better recover the original eol format if we
4907 encounter an inconsistent eol format while decoding. */
4908 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4909 }
4910
4911 coding->produced = coding->produced_char = 0;
4912 coding->consumed = coding->consumed_char = 0;
4913 coding->errors = 0;
4914 coding->result = CODING_FINISH_NORMAL;
4915
4916 switch (coding->type)
4917 {
4918 case coding_type_sjis:
4919 decode_coding_sjis_big5 (coding, source, destination,
4920 src_bytes, dst_bytes, 1);
4921 break;
4922
4923 case coding_type_iso2022:
4924 decode_coding_iso2022 (coding, source, destination,
4925 src_bytes, dst_bytes);
4926 break;
4927
4928 case coding_type_big5:
4929 decode_coding_sjis_big5 (coding, source, destination,
4930 src_bytes, dst_bytes, 0);
4931 break;
4932
4933 case coding_type_emacs_mule:
4934 decode_coding_emacs_mule (coding, source, destination,
4935 src_bytes, dst_bytes);
4936 break;
4937
4938 case coding_type_ccl:
4939 if (coding->spec.ccl.cr_carryover)
4940 {
4941 /* Put the CR which was not processed by the previous call
4942 of decode_eol_post_ccl in DESTINATION. It will be
4943 decoded together with the following LF by the call to
4944 decode_eol_post_ccl below. */
4945 *destination = '\r';
4946 coding->produced++;
4947 coding->produced_char++;
4948 dst_bytes--;
4949 extra = coding->spec.ccl.cr_carryover;
4950 }
4951 ccl_coding_driver (coding, source, destination + extra,
4952 src_bytes, dst_bytes, 0);
4953 if (coding->eol_type != CODING_EOL_LF)
4954 {
4955 coding->produced += extra;
4956 coding->produced_char += extra;
4957 decode_eol_post_ccl (coding, destination, coding->produced);
4958 }
4959 break;
4960
4961 default:
4962 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4963 }
4964
4965 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4966 && coding->mode & CODING_MODE_LAST_BLOCK
4967 && coding->consumed == src_bytes)
4968 coding->result = CODING_FINISH_NORMAL;
4969
4970 if (coding->mode & CODING_MODE_LAST_BLOCK
4971 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4972 {
4973 const unsigned char *src = source + coding->consumed;
4974 unsigned char *dst = destination + coding->produced;
4975
4976 src_bytes -= coding->consumed;
4977 coding->errors++;
4978 if (COMPOSING_P (coding))
4979 DECODE_COMPOSITION_END ('1');
4980 while (src_bytes--)
4981 {
4982 int c = *src++;
4983 dst += CHAR_STRING (c, dst);
4984 coding->produced_char++;
4985 }
4986 coding->consumed = coding->consumed_char = src - source;
4987 coding->produced = dst - destination;
4988 coding->result = CODING_FINISH_NORMAL;
4989 }
4990
4991 if (!coding->dst_multibyte)
4992 {
4993 coding->produced = str_as_unibyte (destination, coding->produced);
4994 coding->produced_char = coding->produced;
4995 }
4996
4997 return coding->result;
4998 }
4999
5000 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
5001 multibyteness of the source is CODING->src_multibyte, the
5002 multibyteness of the result is always unibyte. */
5003
5004 int
5005 encode_coding (coding, source, destination, src_bytes, dst_bytes)
5006 struct coding_system *coding;
5007 const unsigned char *source;
5008 unsigned char *destination;
5009 int src_bytes, dst_bytes;
5010 {
5011 coding->produced = coding->produced_char = 0;
5012 coding->consumed = coding->consumed_char = 0;
5013 coding->errors = 0;
5014 coding->result = CODING_FINISH_NORMAL;
5015 if (coding->eol_type == CODING_EOL_UNDECIDED)
5016 coding->eol_type = CODING_EOL_LF;
5017
5018 switch (coding->type)
5019 {
5020 case coding_type_sjis:
5021 encode_coding_sjis_big5 (coding, source, destination,
5022 src_bytes, dst_bytes, 1);
5023 break;
5024
5025 case coding_type_iso2022:
5026 encode_coding_iso2022 (coding, source, destination,
5027 src_bytes, dst_bytes);
5028 break;
5029
5030 case coding_type_big5:
5031 encode_coding_sjis_big5 (coding, source, destination,
5032 src_bytes, dst_bytes, 0);
5033 break;
5034
5035 case coding_type_emacs_mule:
5036 encode_coding_emacs_mule (coding, source, destination,
5037 src_bytes, dst_bytes);
5038 break;
5039
5040 case coding_type_ccl:
5041 ccl_coding_driver (coding, source, destination,
5042 src_bytes, dst_bytes, 1);
5043 break;
5044
5045 default:
5046 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5047 }
5048
5049 if (coding->mode & CODING_MODE_LAST_BLOCK
5050 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5051 {
5052 const unsigned char *src = source + coding->consumed;
5053 unsigned char *dst = destination + coding->produced;
5054
5055 if (coding->type == coding_type_iso2022)
5056 ENCODE_RESET_PLANE_AND_REGISTER;
5057 if (COMPOSING_P (coding))
5058 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5059 if (coding->consumed < src_bytes)
5060 {
5061 int len = src_bytes - coding->consumed;
5062
5063 BCOPY_SHORT (src, dst, len);
5064 if (coding->src_multibyte)
5065 len = str_as_unibyte (dst, len);
5066 dst += len;
5067 coding->consumed = src_bytes;
5068 }
5069 coding->produced = coding->produced_char = dst - destination;
5070 coding->result = CODING_FINISH_NORMAL;
5071 }
5072
5073 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5074 && coding->consumed == src_bytes)
5075 coding->result = CODING_FINISH_NORMAL;
5076
5077 return coding->result;
5078 }
5079
5080 /* Scan text in the region between *BEG and *END (byte positions),
5081 skip characters which we don't have to decode by coding system
5082 CODING at the head and tail, then set *BEG and *END to the region
5083 of the text we actually have to convert. The caller should move
5084 the gap out of the region in advance if the region is from a
5085 buffer.
5086
5087 If STR is not NULL, *BEG and *END are indices into STR. */
5088
5089 static void
5090 shrink_decoding_region (beg, end, coding, str)
5091 int *beg, *end;
5092 struct coding_system *coding;
5093 unsigned char *str;
5094 {
5095 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5096 int eol_conversion;
5097 Lisp_Object translation_table;
5098
5099 if (coding->type == coding_type_ccl
5100 || coding->type == coding_type_undecided
5101 || coding->eol_type != CODING_EOL_LF
5102 || !NILP (coding->post_read_conversion)
5103 || coding->composing != COMPOSITION_DISABLED)
5104 {
5105 /* We can't skip any data. */
5106 return;
5107 }
5108 if (coding->type == coding_type_no_conversion
5109 || coding->type == coding_type_raw_text
5110 || coding->type == coding_type_emacs_mule)
5111 {
5112 /* We need no conversion, but don't have to skip any data here.
5113 Decoding routine handles them effectively anyway. */
5114 return;
5115 }
5116
5117 translation_table = coding->translation_table_for_decode;
5118 if (NILP (translation_table) && !NILP (Venable_character_translation))
5119 translation_table = Vstandard_translation_table_for_decode;
5120 if (CHAR_TABLE_P (translation_table))
5121 {
5122 int i;
5123 for (i = 0; i < 128; i++)
5124 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5125 break;
5126 if (i < 128)
5127 /* Some ASCII character should be translated. We give up
5128 shrinking. */
5129 return;
5130 }
5131
5132 if (coding->heading_ascii >= 0)
5133 /* Detection routine has already found how much we can skip at the
5134 head. */
5135 *beg += coding->heading_ascii;
5136
5137 if (str)
5138 {
5139 begp_orig = begp = str + *beg;
5140 endp_orig = endp = str + *end;
5141 }
5142 else
5143 {
5144 begp_orig = begp = BYTE_POS_ADDR (*beg);
5145 endp_orig = endp = begp + *end - *beg;
5146 }
5147
5148 eol_conversion = (coding->eol_type == CODING_EOL_CR
5149 || coding->eol_type == CODING_EOL_CRLF);
5150
5151 switch (coding->type)
5152 {
5153 case coding_type_sjis:
5154 case coding_type_big5:
5155 /* We can skip all ASCII characters at the head. */
5156 if (coding->heading_ascii < 0)
5157 {
5158 if (eol_conversion)
5159 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5160 else
5161 while (begp < endp && *begp < 0x80) begp++;
5162 }
5163 /* We can skip all ASCII characters at the tail except for the
5164 second byte of SJIS or BIG5 code. */
5165 if (eol_conversion)
5166 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5167 else
5168 while (begp < endp && endp[-1] < 0x80) endp--;
5169 /* Do not consider LF as ascii if preceded by CR, since that
5170 confuses eol decoding. */
5171 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5172 endp++;
5173 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5174 endp++;
5175 break;
5176
5177 case coding_type_iso2022:
5178 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5179 /* We can't skip any data. */
5180 break;
5181 if (coding->heading_ascii < 0)
5182 {
5183 /* We can skip all ASCII characters at the head except for a
5184 few control codes. */
5185 while (begp < endp && (c = *begp) < 0x80
5186 && c != ISO_CODE_CR && c != ISO_CODE_SO
5187 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5188 && (!eol_conversion || c != ISO_CODE_LF))
5189 begp++;
5190 }
5191 switch (coding->category_idx)
5192 {
5193 case CODING_CATEGORY_IDX_ISO_8_1:
5194 case CODING_CATEGORY_IDX_ISO_8_2:
5195 /* We can skip all ASCII characters at the tail. */
5196 if (eol_conversion)
5197 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5198 else
5199 while (begp < endp && endp[-1] < 0x80) endp--;
5200 /* Do not consider LF as ascii if preceded by CR, since that
5201 confuses eol decoding. */
5202 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5203 endp++;
5204 break;
5205
5206 case CODING_CATEGORY_IDX_ISO_7:
5207 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5208 {
5209 /* We can skip all characters at the tail except for 8-bit
5210 codes and ESC and the following 2-byte at the tail. */
5211 unsigned char *eight_bit = NULL;
5212
5213 if (eol_conversion)
5214 while (begp < endp
5215 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5216 {
5217 if (!eight_bit && c & 0x80) eight_bit = endp;
5218 endp--;
5219 }
5220 else
5221 while (begp < endp
5222 && (c = endp[-1]) != ISO_CODE_ESC)
5223 {
5224 if (!eight_bit && c & 0x80) eight_bit = endp;
5225 endp--;
5226 }
5227 /* Do not consider LF as ascii if preceded by CR, since that
5228 confuses eol decoding. */
5229 if (begp < endp && endp < endp_orig
5230 && endp[-1] == '\r' && endp[0] == '\n')
5231 endp++;
5232 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5233 {
5234 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5235 /* This is an ASCII designation sequence. We can
5236 surely skip the tail. But, if we have
5237 encountered an 8-bit code, skip only the codes
5238 after that. */
5239 endp = eight_bit ? eight_bit : endp + 2;
5240 else
5241 /* Hmmm, we can't skip the tail. */
5242 endp = endp_orig;
5243 }
5244 else if (eight_bit)
5245 endp = eight_bit;
5246 }
5247 }
5248 break;
5249
5250 default:
5251 abort ();
5252 }
5253 *beg += begp - begp_orig;
5254 *end += endp - endp_orig;
5255 return;
5256 }
5257
5258 /* Like shrink_decoding_region but for encoding. */
5259
5260 static void
5261 shrink_encoding_region (beg, end, coding, str)
5262 int *beg, *end;
5263 struct coding_system *coding;
5264 unsigned char *str;
5265 {
5266 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5267 int eol_conversion;
5268 Lisp_Object translation_table;
5269
5270 if (coding->type == coding_type_ccl
5271 || coding->eol_type == CODING_EOL_CRLF
5272 || coding->eol_type == CODING_EOL_CR
5273 || (coding->cmp_data && coding->cmp_data->used > 0))
5274 {
5275 /* We can't skip any data. */
5276 return;
5277 }
5278 if (coding->type == coding_type_no_conversion
5279 || coding->type == coding_type_raw_text
5280 || coding->type == coding_type_emacs_mule
5281 || coding->type == coding_type_undecided)
5282 {
5283 /* We need no conversion, but don't have to skip any data here.
5284 Encoding routine handles them effectively anyway. */
5285 return;
5286 }
5287
5288 translation_table = coding->translation_table_for_encode;
5289 if (NILP (translation_table) && !NILP (Venable_character_translation))
5290 translation_table = Vstandard_translation_table_for_encode;
5291 if (CHAR_TABLE_P (translation_table))
5292 {
5293 int i;
5294 for (i = 0; i < 128; i++)
5295 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5296 break;
5297 if (i < 128)
5298 /* Some ASCII character should be translated. We give up
5299 shrinking. */
5300 return;
5301 }
5302
5303 if (str)
5304 {
5305 begp_orig = begp = str + *beg;
5306 endp_orig = endp = str + *end;
5307 }
5308 else
5309 {
5310 begp_orig = begp = BYTE_POS_ADDR (*beg);
5311 endp_orig = endp = begp + *end - *beg;
5312 }
5313
5314 eol_conversion = (coding->eol_type == CODING_EOL_CR
5315 || coding->eol_type == CODING_EOL_CRLF);
5316
5317 /* Here, we don't have to check coding->pre_write_conversion because
5318 the caller is expected to have handled it already. */
5319 switch (coding->type)
5320 {
5321 case coding_type_iso2022:
5322 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5323 /* We can't skip any data. */
5324 break;
5325 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5326 {
5327 unsigned char *bol = begp;
5328 while (begp < endp && *begp < 0x80)
5329 {
5330 begp++;
5331 if (begp[-1] == '\n')
5332 bol = begp;
5333 }
5334 begp = bol;
5335 goto label_skip_tail;
5336 }
5337 /* fall down ... */
5338
5339 case coding_type_sjis:
5340 case coding_type_big5:
5341 /* We can skip all ASCII characters at the head and tail. */
5342 if (eol_conversion)
5343 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5344 else
5345 while (begp < endp && *begp < 0x80) begp++;
5346 label_skip_tail:
5347 if (eol_conversion)
5348 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5349 else
5350 while (begp < endp && *(endp - 1) < 0x80) endp--;
5351 break;
5352
5353 default:
5354 abort ();
5355 }
5356
5357 *beg += begp - begp_orig;
5358 *end += endp - endp_orig;
5359 return;
5360 }
5361
/* Shrinking the conversion region has some overhead of its own, so it
   is not attempted unless the region is longer than this many
   bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END with shrink_encoding_region (when
   ENCODEP is nonzero) or shrink_decoding_region (when it is zero),
   provided the region is longer than the threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)         \
  do                                                                     \
    {                                                                    \
      if (*(end) - *(beg) > shrink_conversion_region_threshhold)         \
        {                                                                \
          if (encodep)                                                   \
            shrink_encoding_region ((beg), (end), (coding), (str));      \
          else                                                           \
            shrink_decoding_region ((beg), (end), (coding), (str));      \
        }                                                                \
    }                                                                    \
  while (0)
5375
5376 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5377 Vlast_coding_system_used and the remaining elements are buffers to
5378 kill. */
5379 static Lisp_Object
5380 code_convert_region_unwind (arg)
5381 Lisp_Object arg;
5382 {
5383 struct gcpro gcpro1;
5384 GCPRO1 (arg);
5385
5386 inhibit_pre_post_conversion = 0;
5387 Vlast_coding_system_used = XCAR (arg);
5388 for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5389 Fkill_buffer (XCAR (arg));
5390
5391 UNGCPRO;
5392 return Qnil;
5393 }
5394
5395 /* Store information about all compositions in the range FROM and TO
5396 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5397 buffer or a string, defaults to the current buffer. */
5398
5399 void
5400 coding_save_composition (coding, from, to, obj)
5401 struct coding_system *coding;
5402 int from, to;
5403 Lisp_Object obj;
5404 {
5405 Lisp_Object prop;
5406 int start, end;
5407
5408 if (coding->composing == COMPOSITION_DISABLED)
5409 return;
5410 if (!coding->cmp_data)
5411 coding_allocate_composition_data (coding, from);
5412 if (!find_composition (from, to, &start, &end, &prop, obj)
5413 || end > to)
5414 return;
5415 if (start < from
5416 && (!find_composition (end, to, &start, &end, &prop, obj)
5417 || end > to))
5418 return;
5419 coding->composing = COMPOSITION_NO;
5420 do
5421 {
5422 if (COMPOSITION_VALID_P (start, end, prop))
5423 {
5424 enum composition_method method = COMPOSITION_METHOD (prop);
5425 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5426 >= COMPOSITION_DATA_SIZE)
5427 coding_allocate_composition_data (coding, from);
5428 /* For relative composition, we remember start and end
5429 positions, for the other compositions, we also remember
5430 components. */
5431 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5432 if (method != COMPOSITION_RELATIVE)
5433 {
5434 /* We must store a*/
5435 Lisp_Object val, ch;
5436
5437 val = COMPOSITION_COMPONENTS (prop);
5438 if (CONSP (val))
5439 while (CONSP (val))
5440 {
5441 ch = XCAR (val), val = XCDR (val);
5442 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5443 }
5444 else if (VECTORP (val) || STRINGP (val))
5445 {
5446 int len = (VECTORP (val)
5447 ? XVECTOR (val)->size : SCHARS (val));
5448 int i;
5449 for (i = 0; i < len; i++)
5450 {
5451 ch = (STRINGP (val)
5452 ? Faref (val, make_number (i))
5453 : XVECTOR (val)->contents[i]);
5454 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5455 }
5456 }
5457 else /* INTEGERP (val) */
5458 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5459 }
5460 CODING_ADD_COMPOSITION_END (coding, end - from);
5461 }
5462 start = end;
5463 }
5464 while (start < to
5465 && find_composition (start, to, &start, &end, &prop, obj)
5466 && end <= to);
5467
5468 /* Make coding->cmp_data point to the first memory block. */
5469 while (coding->cmp_data->prev)
5470 coding->cmp_data = coding->cmp_data->prev;
5471 coding->cmp_data_start = 0;
5472 }
5473
5474 /* Reflect the saved information about compositions to OBJ.
5475 CODING->cmp_data points to a memory block for the information. OBJ
5476 is a buffer or a string, defaults to the current buffer. */
5477
5478 void
5479 coding_restore_composition (coding, obj)
5480 struct coding_system *coding;
5481 Lisp_Object obj;
5482 {
5483 struct composition_data *cmp_data = coding->cmp_data;
5484
5485 if (!cmp_data)
5486 return;
5487
5488 while (cmp_data->prev)
5489 cmp_data = cmp_data->prev;
5490
5491 while (cmp_data)
5492 {
5493 int i;
5494
5495 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5496 i += cmp_data->data[i])
5497 {
5498 int *data = cmp_data->data + i;
5499 enum composition_method method = (enum composition_method) data[3];
5500 Lisp_Object components;
5501
5502 if (data[0] < 0 || i + data[0] > cmp_data->used)
5503 /* Invalid composition data. */
5504 break;
5505
5506 if (method == COMPOSITION_RELATIVE)
5507 components = Qnil;
5508 else
5509 {
5510 int len = data[0] - 4, j;
5511 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5512
5513 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5514 && len % 2 == 0)
5515 len --;
5516 if (len < 1)
5517 /* Invalid composition data. */
5518 break;
5519 for (j = 0; j < len; j++)
5520 args[j] = make_number (data[4 + j]);
5521 components = (method == COMPOSITION_WITH_ALTCHARS
5522 ? Fstring (len, args)
5523 : Fvector (len, args));
5524 }
5525 compose_text (data[1], data[2], components, Qnil, obj);
5526 }
5527 cmp_data = cmp_data->next;
5528 }
5529 }
5530
5531 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5532 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5533 coding system CODING, and return the status code of code conversion
5534 (currently, this value has no meaning).
5535
5536 How many characters (and bytes) are converted to how many
5537 characters (and bytes) are recorded in members of the structure
5538 CODING.
5539
5540 If REPLACE is nonzero, we do various things as if the original text
5541 is deleted and a new text is inserted. See the comments in
5542 replace_range (insdel.c) to know what we are doing.
5543
5544 If REPLACE is zero, it is assumed that the source text is unibyte.
5545 Otherwise, it is assumed that the source text is multibyte. */
5546
5547 int
5548 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5549 int from, from_byte, to, to_byte, encodep, replace;
5550 struct coding_system *coding;
5551 {
5552 int len = to - from, len_byte = to_byte - from_byte;
5553 int nchars_del = 0, nbytes_del = 0;
5554 int require, inserted, inserted_byte;
5555 int head_skip, tail_skip, total_skip = 0;
5556 Lisp_Object saved_coding_symbol;
5557 int first = 1;
5558 unsigned char *src, *dst;
5559 Lisp_Object deletion;
5560 int orig_point = PT, orig_len = len;
5561 int prev_Z;
5562 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5563
5564 deletion = Qnil;
5565 saved_coding_symbol = coding->symbol;
5566
5567 if (from < PT && PT < to)
5568 {
5569 TEMP_SET_PT_BOTH (from, from_byte);
5570 orig_point = from;
5571 }
5572
5573 if (replace)
5574 {
5575 int saved_from = from;
5576 int saved_inhibit_modification_hooks;
5577
5578 prepare_to_modify_buffer (from, to, &from);
5579 if (saved_from != from)
5580 {
5581 to = from + len;
5582 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5583 len_byte = to_byte - from_byte;
5584 }
5585
5586 /* The code conversion routine can not preserve text properties
5587 for now. So, we must remove all text properties in the
5588 region. Here, we must suppress all modification hooks. */
5589 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5590 inhibit_modification_hooks = 1;
5591 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5592 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5593 }
5594
5595 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5596 {
5597 /* We must detect encoding of text and eol format. */
5598
5599 if (from < GPT && to > GPT)
5600 move_gap_both (from, from_byte);
5601 if (coding->type == coding_type_undecided)
5602 {
5603 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5604 if (coding->type == coding_type_undecided)
5605 {
5606 /* It seems that the text contains only ASCII, but we
5607 should not leave it undecided because the deeper
5608 decoding routine (decode_coding) tries to detect the
5609 encodings again in vain. */
5610 coding->type = coding_type_emacs_mule;
5611 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5612 /* As emacs-mule decoder will handle composition, we
5613 need this setting to allocate coding->cmp_data
5614 later. */
5615 coding->composing = COMPOSITION_NO;
5616 }
5617 }
5618 if (coding->eol_type == CODING_EOL_UNDECIDED
5619 && coding->type != coding_type_ccl)
5620 {
5621 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5622 if (coding->eol_type == CODING_EOL_UNDECIDED)
5623 coding->eol_type = CODING_EOL_LF;
5624 /* We had better recover the original eol format if we
5625 encounter an inconsistent eol format while decoding. */
5626 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5627 }
5628 }
5629
5630 /* Now we convert the text. */
5631
5632 /* For encoding, we must process pre-write-conversion in advance. */
5633 if (! inhibit_pre_post_conversion
5634 && encodep
5635 && SYMBOLP (coding->pre_write_conversion)
5636 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5637 {
5638 /* The function in pre-write-conversion may put a new text in a
5639 new buffer. */
5640 struct buffer *prev = current_buffer;
5641 Lisp_Object new;
5642
5643 record_unwind_protect (code_convert_region_unwind,
5644 Fcons (Vlast_coding_system_used, Qnil));
5645 /* We should not call any more pre-write/post-read-conversion
5646 functions while this pre-write-conversion is running. */
5647 inhibit_pre_post_conversion = 1;
5648 call2 (coding->pre_write_conversion,
5649 make_number (from), make_number (to));
5650 inhibit_pre_post_conversion = 0;
5651 /* Discard the unwind protect. */
5652 specpdl_ptr--;
5653
5654 if (current_buffer != prev)
5655 {
5656 len = ZV - BEGV;
5657 new = Fcurrent_buffer ();
5658 set_buffer_internal_1 (prev);
5659 del_range_2 (from, from_byte, to, to_byte, 0);
5660 TEMP_SET_PT_BOTH (from, from_byte);
5661 insert_from_buffer (XBUFFER (new), 1, len, 0);
5662 Fkill_buffer (new);
5663 if (orig_point >= to)
5664 orig_point += len - orig_len;
5665 else if (orig_point > from)
5666 orig_point = from;
5667 orig_len = len;
5668 to = from + len;
5669 from_byte = CHAR_TO_BYTE (from);
5670 to_byte = CHAR_TO_BYTE (to);
5671 len_byte = to_byte - from_byte;
5672 TEMP_SET_PT_BOTH (from, from_byte);
5673 }
5674 }
5675
5676 if (replace)
5677 {
5678 if (! EQ (current_buffer->undo_list, Qt))
5679 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5680 else
5681 {
5682 nchars_del = to - from;
5683 nbytes_del = to_byte - from_byte;
5684 }
5685 }
5686
5687 if (coding->composing != COMPOSITION_DISABLED)
5688 {
5689 if (encodep)
5690 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5691 else
5692 coding_allocate_composition_data (coding, from);
5693 }
5694
5695 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5696 if we must run CCL program or there are compositions to
5697 encode. */
5698 if (coding->type != coding_type_ccl
5699 && (! coding->cmp_data || coding->cmp_data->used == 0))
5700 {
5701 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5702
5703 if (from < GPT && GPT < to)
5704 move_gap_both (from, from_byte);
5705 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5706 if (from_byte == to_byte
5707 && (encodep || NILP (coding->post_read_conversion))
5708 && ! CODING_REQUIRE_FLUSHING (coding))
5709 {
5710 coding->produced = len_byte;
5711 coding->produced_char = len;
5712 if (!replace)
5713 /* We must record and adjust for this new text now. */
5714 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5715 coding_free_composition_data (coding);
5716 return 0;
5717 }
5718
5719 head_skip = from_byte - from_byte_orig;
5720 tail_skip = to_byte_orig - to_byte;
5721 total_skip = head_skip + tail_skip;
5722 from += head_skip;
5723 to -= tail_skip;
5724 len -= total_skip; len_byte -= total_skip;
5725 }
5726
5727 /* For conversion, we must put the gap before the text in addition to
5728 making the gap larger for efficient decoding. The required gap
5729 size starts from 2000 which is the magic number used in make_gap.
5730 But, after one batch of conversion, it will be incremented if we
5731 find that it is not enough . */
5732 require = 2000;
5733
5734 if (GAP_SIZE < require)
5735 make_gap (require - GAP_SIZE);
5736 move_gap_both (from, from_byte);
5737
5738 inserted = inserted_byte = 0;
5739
5740 GAP_SIZE += len_byte;
5741 ZV -= len;
5742 Z -= len;
5743 ZV_BYTE -= len_byte;
5744 Z_BYTE -= len_byte;
5745
5746 if (GPT - BEG < BEG_UNCHANGED)
5747 BEG_UNCHANGED = GPT - BEG;
5748 if (Z - GPT < END_UNCHANGED)
5749 END_UNCHANGED = Z - GPT;
5750
5751 if (!encodep && coding->src_multibyte)
5752 {
5753 /* Decoding routines expects that the source text is unibyte.
5754 We must convert 8-bit characters of multibyte form to
5755 unibyte. */
5756 int len_byte_orig = len_byte;
5757 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5758 if (len_byte < len_byte_orig)
5759 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5760 len_byte);
5761 coding->src_multibyte = 0;
5762 }
5763
5764 for (;;)
5765 {
5766 int result;
5767
5768 /* The buffer memory is now:
5769 +--------+converted-text+---------+-------original-text-------+---+
5770 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5771 |<---------------------- GAP ----------------------->| */
5772 src = GAP_END_ADDR - len_byte;
5773 dst = GPT_ADDR + inserted_byte;
5774
5775 if (encodep)
5776 result = encode_coding (coding, src, dst, len_byte, 0);
5777 else
5778 {
5779 if (coding->composing != COMPOSITION_DISABLED)
5780 coding->cmp_data->char_offset = from + inserted;
5781 result = decode_coding (coding, src, dst, len_byte, 0);
5782 }
5783
5784 /* The buffer memory is now:
5785 +--------+-------converted-text----+--+------original-text----+---+
5786 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5787 |<---------------------- GAP ----------------------->| */
5788
5789 inserted += coding->produced_char;
5790 inserted_byte += coding->produced;
5791 len_byte -= coding->consumed;
5792
5793 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5794 {
5795 coding_allocate_composition_data (coding, from + inserted);
5796 continue;
5797 }
5798
5799 src += coding->consumed;
5800 dst += coding->produced;
5801
5802 if (result == CODING_FINISH_NORMAL)
5803 {
5804 src += len_byte;
5805 break;
5806 }
5807 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5808 {
5809 unsigned char *pend = dst, *p = pend - inserted_byte;
5810 Lisp_Object eol_type;
5811
5812 /* Encode LFs back to the original eol format (CR or CRLF). */
5813 if (coding->eol_type == CODING_EOL_CR)
5814 {
5815 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5816 }
5817 else
5818 {
5819 int count = 0;
5820
5821 while (p < pend) if (*p++ == '\n') count++;
5822 if (src - dst < count)
5823 {
5824 /* We don't have sufficient room for encoding LFs
5825 back to CRLF. We must record converted and
5826 not-yet-converted text back to the buffer
5827 content, enlarge the gap, then record them out of
5828 the buffer contents again. */
5829 int add = len_byte + inserted_byte;
5830
5831 GAP_SIZE -= add;
5832 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5833 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5834 make_gap (count - GAP_SIZE);
5835 GAP_SIZE += add;
5836 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5837 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5838 /* Don't forget to update SRC, DST, and PEND. */
5839 src = GAP_END_ADDR - len_byte;
5840 dst = GPT_ADDR + inserted_byte;
5841 pend = dst;
5842 }
5843 inserted += count;
5844 inserted_byte += count;
5845 coding->produced += count;
5846 p = dst = pend + count;
5847 while (count)
5848 {
5849 *--p = *--pend;
5850 if (*p == '\n') count--, *--p = '\r';
5851 }
5852 }
5853
5854 /* Suppress eol-format conversion in the further conversion. */
5855 coding->eol_type = CODING_EOL_LF;
5856
5857 /* Set the coding system symbol to that for Unix-like EOL. */
5858 eol_type = Fget (saved_coding_symbol, Qeol_type);
5859 if (VECTORP (eol_type)
5860 && XVECTOR (eol_type)->size == 3
5861 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5862 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5863 else
5864 coding->symbol = saved_coding_symbol;
5865
5866 continue;
5867 }
5868 if (len_byte <= 0)
5869 {
5870 if (coding->type != coding_type_ccl
5871 || coding->mode & CODING_MODE_LAST_BLOCK)
5872 break;
5873 coding->mode |= CODING_MODE_LAST_BLOCK;
5874 continue;
5875 }
5876 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5877 {
5878 /* The source text ends in invalid codes. Let's just
5879 make them valid buffer contents, and finish conversion. */
5880 if (multibyte_p)
5881 {
5882 unsigned char *start = dst;
5883
5884 inserted += len_byte;
5885 while (len_byte--)
5886 {
5887 int c = *src++;
5888 dst += CHAR_STRING (c, dst);
5889 }
5890
5891 inserted_byte += dst - start;
5892 }
5893 else
5894 {
5895 inserted += len_byte;
5896 inserted_byte += len_byte;
5897 while (len_byte--)
5898 *dst++ = *src++;
5899 }
5900 break;
5901 }
5902 if (result == CODING_FINISH_INTERRUPT)
5903 {
5904 /* The conversion procedure was interrupted by a user. */
5905 break;
5906 }
5907 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5908 if (coding->consumed < 1)
5909 {
5910 /* It's quite strange to require more memory without
5911 consuming any bytes. Perhaps CCL program bug. */
5912 break;
5913 }
5914 if (first)
5915 {
5916 /* We have just done the first batch of conversion which was
5917 stopped because of insufficient gap. Let's reconsider the
5918 required gap size (i.e. SRT - DST) now.
5919
5920 We have converted ORIG bytes (== coding->consumed) into
5921 NEW bytes (coding->produced). To convert the remaining
5922 LEN bytes, we may need REQUIRE bytes of gap, where:
5923 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5924 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5925 Here, we are sure that NEW >= ORIG. */
5926
5927 if (coding->produced <= coding->consumed)
5928 {
5929 /* This happens because of CCL-based coding system with
5930 eol-type CRLF. */
5931 require = 0;
5932 }
5933 else
5934 {
5935 float ratio = coding->produced - coding->consumed;
5936 ratio /= coding->consumed;
5937 require = len_byte * ratio;
5938 }
5939 first = 0;
5940 }
5941 if ((src - dst) < (require + 2000))
5942 {
5943 /* See the comment above the previous call of make_gap. */
5944 int add = len_byte + inserted_byte;
5945
5946 GAP_SIZE -= add;
5947 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5948 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5949 make_gap (require + 2000);
5950 GAP_SIZE += add;
5951 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5952 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5953 }
5954 }
5955 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5956
5957 if (encodep && coding->dst_multibyte)
5958 {
5959 /* The output is unibyte. We must convert 8-bit characters to
5960 multibyte form. */
5961 if (inserted_byte * 2 > GAP_SIZE)
5962 {
5963 GAP_SIZE -= inserted_byte;
5964 ZV += inserted_byte; Z += inserted_byte;
5965 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5966 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5967 make_gap (inserted_byte - GAP_SIZE);
5968 GAP_SIZE += inserted_byte;
5969 ZV -= inserted_byte; Z -= inserted_byte;
5970 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5971 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5972 }
5973 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5974 }
5975
5976 /* If we shrank the conversion area, adjust it now. */
5977 if (total_skip > 0)
5978 {
5979 if (tail_skip > 0)
5980 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5981 inserted += total_skip; inserted_byte += total_skip;
5982 GAP_SIZE += total_skip;
5983 GPT -= head_skip; GPT_BYTE -= head_skip;
5984 ZV -= total_skip; ZV_BYTE -= total_skip;
5985 Z -= total_skip; Z_BYTE -= total_skip;
5986 from -= head_skip; from_byte -= head_skip;
5987 to += tail_skip; to_byte += tail_skip;
5988 }
5989
5990 prev_Z = Z;
5991 if (! EQ (current_buffer->undo_list, Qt))
5992 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5993 else
5994 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5995 inserted, inserted_byte);
5996 inserted = Z - prev_Z;
5997
5998 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5999 coding_restore_composition (coding, Fcurrent_buffer ());
6000 coding_free_composition_data (coding);
6001
6002 if (! inhibit_pre_post_conversion
6003 && ! encodep && ! NILP (coding->post_read_conversion))
6004 {
6005 Lisp_Object val;
6006 Lisp_Object saved_coding_system;
6007
6008 if (from != PT)
6009 TEMP_SET_PT_BOTH (from, from_byte);
6010 prev_Z = Z;
6011 record_unwind_protect (code_convert_region_unwind,
6012 Fcons (Vlast_coding_system_used, Qnil));
6013 saved_coding_system = Vlast_coding_system_used;
6014 Vlast_coding_system_used = coding->symbol;
6015 /* We should not call any more pre-write/post-read-conversion
6016 functions while this post-read-conversion is running. */
6017 inhibit_pre_post_conversion = 1;
6018 val = call1 (coding->post_read_conversion, make_number (inserted));
6019 inhibit_pre_post_conversion = 0;
6020 coding->symbol = Vlast_coding_system_used;
6021 Vlast_coding_system_used = saved_coding_system;
6022 /* Discard the unwind protect. */
6023 specpdl_ptr--;
6024 CHECK_NUMBER (val);
6025 inserted += Z - prev_Z;
6026 }
6027
6028 if (orig_point >= from)
6029 {
6030 if (orig_point >= from + orig_len)
6031 orig_point += inserted - orig_len;
6032 else
6033 orig_point = from;
6034 TEMP_SET_PT (orig_point);
6035 }
6036
6037 if (replace)
6038 {
6039 signal_after_change (from, to - from, inserted);
6040 update_compositions (from, from + inserted, CHECK_BORDER);
6041 }
6042
6043 {
6044 coding->consumed = to_byte - from_byte;
6045 coding->consumed_char = to - from;
6046 coding->produced = inserted_byte;
6047 coding->produced_char = inserted;
6048 }
6049
6050 return 0;
6051 }
6052
6053 /* Name (or base name) of work buffer for code conversion. */
6054 static Lisp_Object Vcode_conversion_workbuf_name;
6055
6056 /* Set the current buffer to the working buffer prepared for
6057 code-conversion. MULTIBYTE specifies the multibyteness of the
6058 buffer. Return the buffer we set if it must be killed after use.
6059 Otherwise return Qnil. */
6060
6061 static Lisp_Object
6062 set_conversion_work_buffer (multibyte)
6063 int multibyte;
6064 {
6065 Lisp_Object buffer, buffer_to_kill;
6066 struct buffer *buf;
6067
6068 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6069 buf = XBUFFER (buffer);
6070 if (buf == current_buffer)
6071 {
6072 /* As we are already in the work buffer, we must generate a new
6073 buffer for the work. */
6074 Lisp_Object name;
6075
6076 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6077 buffer = buffer_to_kill = Fget_buffer_create (name);
6078 buf = XBUFFER (buffer);
6079 }
6080 else
6081 buffer_to_kill = Qnil;
6082
6083 delete_all_overlays (buf);
6084 buf->directory = current_buffer->directory;
6085 buf->read_only = Qnil;
6086 buf->filename = Qnil;
6087 buf->undo_list = Qt;
6088 eassert (buf->overlays_before == NULL);
6089 eassert (buf->overlays_after == NULL);
6090 set_buffer_internal (buf);
6091 if (BEG != BEGV || Z != ZV)
6092 Fwiden ();
6093 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6094 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6095 return buffer_to_kill;
6096 }
6097
6098 Lisp_Object
6099 run_pre_post_conversion_on_str (str, coding, encodep)
6100 Lisp_Object str;
6101 struct coding_system *coding;
6102 int encodep;
6103 {
6104 int count = SPECPDL_INDEX ();
6105 struct gcpro gcpro1, gcpro2;
6106 int multibyte = STRING_MULTIBYTE (str);
6107 Lisp_Object old_deactivate_mark;
6108 Lisp_Object buffer_to_kill;
6109 Lisp_Object unwind_arg;
6110
6111 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6112 /* It is not crucial to specbind this. */
6113 old_deactivate_mark = Vdeactivate_mark;
6114 GCPRO2 (str, old_deactivate_mark);
6115
6116 /* We must insert the contents of STR as is without
6117 unibyte<->multibyte conversion. For that, we adjust the
6118 multibyteness of the working buffer to that of STR. */
6119 buffer_to_kill = set_conversion_work_buffer (multibyte);
6120 if (NILP (buffer_to_kill))
6121 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6122 else
6123 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6124 record_unwind_protect (code_convert_region_unwind, unwind_arg);
6125
6126 insert_from_string (str, 0, 0,
6127 SCHARS (str), SBYTES (str), 0);
6128 UNGCPRO;
6129 inhibit_pre_post_conversion = 1;
6130 if (encodep)
6131 {
6132 struct buffer *prev = current_buffer;
6133
6134 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6135 if (prev != current_buffer)
6136 /* We must kill the current buffer too. */
6137 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6138 }
6139 else
6140 {
6141 Vlast_coding_system_used = coding->symbol;
6142 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6143 call1 (coding->post_read_conversion, make_number (Z - BEG));
6144 coding->symbol = Vlast_coding_system_used;
6145 }
6146 inhibit_pre_post_conversion = 0;
6147 Vdeactivate_mark = old_deactivate_mark;
6148 str = make_buffer_string (BEG, Z, 1);
6149 return unbind_to (count, str);
6150 }
6151
6152
6153 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6154 text in *STR. *SIZE is the allocated bytes for STR. As it
6155 is intended that this function is called from encode_terminal_code,
6156 the pre-write-conversion function is run by safe_call and thus
6157 "Error during redisplay: ..." is logged when an error occurs.
6158
6159 Store the resulting text in *STR and set CODING->produced_char and
6160 CODING->produced to the number of characters and bytes
6161 respectively. If the size of *STR is too small, enlarge it by
6162 xrealloc and update *STR and *SIZE. */
6163
6164 void
6165 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6166 unsigned char **str;
6167 int *size, nchars, nbytes;
6168 struct coding_system *coding;
6169 {
6170 struct gcpro gcpro1, gcpro2;
6171 struct buffer *cur = current_buffer;
6172 struct buffer *prev;
6173 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6174 Lisp_Object args[3];
6175 Lisp_Object buffer_to_kill;
6176
6177 /* It is not crucial to specbind this. */
6178 old_deactivate_mark = Vdeactivate_mark;
6179 old_last_coding_system_used = Vlast_coding_system_used;
6180 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6181
6182 /* We must insert the contents of STR as is without
6183 unibyte<->multibyte conversion. For that, we adjust the
6184 multibyteness of the working buffer to that of STR. */
6185 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6186 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6187 UNGCPRO;
6188 inhibit_pre_post_conversion = 1;
6189 prev = current_buffer;
6190 args[0] = coding->pre_write_conversion;
6191 args[1] = make_number (BEG);
6192 args[2] = make_number (Z);
6193 safe_call (3, args);
6194 inhibit_pre_post_conversion = 0;
6195 Vdeactivate_mark = old_deactivate_mark;
6196 Vlast_coding_system_used = old_last_coding_system_used;
6197 coding->produced_char = Z - BEG;
6198 coding->produced = Z_BYTE - BEG_BYTE;
6199 if (coding->produced > *size)
6200 {
6201 *size = coding->produced;
6202 *str = xrealloc (*str, *size);
6203 }
6204 if (BEG < GPT && GPT < Z)
6205 move_gap (BEG);
6206 bcopy (BEG_ADDR, *str, coding->produced);
6207 coding->src_multibyte
6208 = ! NILP (current_buffer->enable_multibyte_characters);
6209 if (prev != current_buffer)
6210 Fkill_buffer (Fcurrent_buffer ());
6211 set_buffer_internal (cur);
6212 if (! NILP (buffer_to_kill))
6213 Fkill_buffer (buffer_to_kill);
6214 }
6215
6216
6217 Lisp_Object
6218 decode_coding_string (str, coding, nocopy)
6219 Lisp_Object str;
6220 struct coding_system *coding;
6221 int nocopy;
6222 {
6223 int len;
6224 struct conversion_buffer buf;
6225 int from, to_byte;
6226 Lisp_Object saved_coding_symbol;
6227 int result;
6228 int require_decoding;
6229 int shrinked_bytes = 0;
6230 Lisp_Object newstr;
6231 int consumed, consumed_char, produced, produced_char;
6232
6233 from = 0;
6234 to_byte = SBYTES (str);
6235
6236 saved_coding_symbol = coding->symbol;
6237 coding->src_multibyte = STRING_MULTIBYTE (str);
6238 coding->dst_multibyte = 1;
6239 if (CODING_REQUIRE_DETECTION (coding))
6240 {
6241 /* See the comments in code_convert_region. */
6242 if (coding->type == coding_type_undecided)
6243 {
6244 detect_coding (coding, SDATA (str), to_byte);
6245 if (coding->type == coding_type_undecided)
6246 {
6247 coding->type = coding_type_emacs_mule;
6248 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6249 /* As emacs-mule decoder will handle composition, we
6250 need this setting to allocate coding->cmp_data
6251 later. */
6252 coding->composing = COMPOSITION_NO;
6253 }
6254 }
6255 if (coding->eol_type == CODING_EOL_UNDECIDED
6256 && coding->type != coding_type_ccl)
6257 {
6258 saved_coding_symbol = coding->symbol;
6259 detect_eol (coding, SDATA (str), to_byte);
6260 if (coding->eol_type == CODING_EOL_UNDECIDED)
6261 coding->eol_type = CODING_EOL_LF;
6262 /* We had better recover the original eol format if we
6263 encounter an inconsistent eol format while decoding. */
6264 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6265 }
6266 }
6267
6268 if (coding->type == coding_type_no_conversion
6269 || coding->type == coding_type_raw_text)
6270 coding->dst_multibyte = 0;
6271
6272 require_decoding = CODING_REQUIRE_DECODING (coding);
6273
6274 if (STRING_MULTIBYTE (str))
6275 {
6276 /* Decoding routines expect the source text to be unibyte. */
6277 str = Fstring_as_unibyte (str);
6278 to_byte = SBYTES (str);
6279 nocopy = 1;
6280 coding->src_multibyte = 0;
6281 }
6282
6283 /* Try to skip the heading and tailing ASCIIs. */
6284 if (require_decoding && coding->type != coding_type_ccl)
6285 {
6286 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6287 0);
6288 if (from == to_byte)
6289 require_decoding = 0;
6290 shrinked_bytes = from + (SBYTES (str) - to_byte);
6291 }
6292
6293 if (!require_decoding
6294 && !(SYMBOLP (coding->post_read_conversion)
6295 && !NILP (Ffboundp (coding->post_read_conversion))))
6296 {
6297 coding->consumed = SBYTES (str);
6298 coding->consumed_char = SCHARS (str);
6299 if (coding->dst_multibyte)
6300 {
6301 str = Fstring_as_multibyte (str);
6302 nocopy = 1;
6303 }
6304 coding->produced = SBYTES (str);
6305 coding->produced_char = SCHARS (str);
6306 return (nocopy ? str : Fcopy_sequence (str));
6307 }
6308
6309 if (coding->composing != COMPOSITION_DISABLED)
6310 coding_allocate_composition_data (coding, from);
6311 len = decoding_buffer_size (coding, to_byte - from);
6312 allocate_conversion_buffer (buf, len);
6313
6314 consumed = consumed_char = produced = produced_char = 0;
6315 while (1)
6316 {
6317 result = decode_coding (coding, SDATA (str) + from + consumed,
6318 buf.data + produced, to_byte - from - consumed,
6319 buf.size - produced);
6320 consumed += coding->consumed;
6321 consumed_char += coding->consumed_char;
6322 produced += coding->produced;
6323 produced_char += coding->produced_char;
6324 if (result == CODING_FINISH_NORMAL
6325 || result == CODING_FINISH_INTERRUPT
6326 || (result == CODING_FINISH_INSUFFICIENT_SRC
6327 && coding->consumed == 0))
6328 break;
6329 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6330 coding_allocate_composition_data (coding, from + produced_char);
6331 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6332 extend_conversion_buffer (&buf);
6333 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6334 {
6335 Lisp_Object eol_type;
6336
6337 /* Recover the original EOL format. */
6338 if (coding->eol_type == CODING_EOL_CR)
6339 {
6340 unsigned char *p;
6341 for (p = buf.data; p < buf.data + produced; p++)
6342 if (*p == '\n') *p = '\r';
6343 }
6344 else if (coding->eol_type == CODING_EOL_CRLF)
6345 {
6346 int num_eol = 0;
6347 unsigned char *p0, *p1;
6348 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6349 if (*p0 == '\n') num_eol++;
6350 if (produced + num_eol >= buf.size)
6351 extend_conversion_buffer (&buf);
6352 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6353 {
6354 *--p1 = *--p0;
6355 if (*p0 == '\n') *--p1 = '\r';
6356 }
6357 produced += num_eol;
6358 produced_char += num_eol;
6359 }
6360 /* Suppress eol-format conversion in the further conversion. */
6361 coding->eol_type = CODING_EOL_LF;
6362
6363 /* Set the coding system symbol to that for Unix-like EOL. */
6364 eol_type = Fget (saved_coding_symbol, Qeol_type);
6365 if (VECTORP (eol_type)
6366 && XVECTOR (eol_type)->size == 3
6367 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6368 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6369 else
6370 coding->symbol = saved_coding_symbol;
6371
6372
6373 }
6374 }
6375
6376 coding->consumed = consumed;
6377 coding->consumed_char = consumed_char;
6378 coding->produced = produced;
6379 coding->produced_char = produced_char;
6380
6381 if (coding->dst_multibyte)
6382 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6383 produced + shrinked_bytes);
6384 else
6385 newstr = make_uninit_string (produced + shrinked_bytes);
6386 if (from > 0)
6387 STRING_COPYIN (newstr, 0, SDATA (str), from);
6388 STRING_COPYIN (newstr, from, buf.data, produced);
6389 if (shrinked_bytes > from)
6390 STRING_COPYIN (newstr, from + produced,
6391 SDATA (str) + to_byte,
6392 shrinked_bytes - from);
6393 free_conversion_buffer (&buf);
6394
6395 coding->consumed += shrinked_bytes;
6396 coding->consumed_char += shrinked_bytes;
6397 coding->produced += shrinked_bytes;
6398 coding->produced_char += shrinked_bytes;
6399
6400 if (coding->cmp_data && coding->cmp_data->used)
6401 coding_restore_composition (coding, newstr);
6402 coding_free_composition_data (coding);
6403
6404 if (SYMBOLP (coding->post_read_conversion)
6405 && !NILP (Ffboundp (coding->post_read_conversion)))
6406 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6407
6408 return newstr;
6409 }
6410
6411 Lisp_Object
6412 encode_coding_string (str, coding, nocopy)
6413 Lisp_Object str;
6414 struct coding_system *coding;
6415 int nocopy;
6416 {
6417 int len;
6418 struct conversion_buffer buf;
6419 int from, to, to_byte;
6420 int result;
6421 int shrinked_bytes = 0;
6422 Lisp_Object newstr;
6423 int consumed, consumed_char, produced, produced_char;
6424
6425 if (SYMBOLP (coding->pre_write_conversion)
6426 && !NILP (Ffboundp (coding->pre_write_conversion)))
6427 {
6428 str = run_pre_post_conversion_on_str (str, coding, 1);
6429 /* As STR is just newly generated, we don't have to copy it
6430 anymore. */
6431 nocopy = 1;
6432 }
6433
6434 from = 0;
6435 to = SCHARS (str);
6436 to_byte = SBYTES (str);
6437
6438 /* Encoding routines determine the multibyteness of the source text
6439 by coding->src_multibyte. */
6440 coding->src_multibyte = SCHARS (str) < SBYTES (str);
6441 coding->dst_multibyte = 0;
6442 if (! CODING_REQUIRE_ENCODING (coding))
6443 goto no_need_of_encoding;
6444
6445 if (coding->composing != COMPOSITION_DISABLED)
6446 coding_save_composition (coding, from, to, str);
6447
6448 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6449 if we must run CCL program or there are compositions to
6450 encode. */
6451 if (coding->type != coding_type_ccl
6452 && (! coding->cmp_data || coding->cmp_data->used == 0))
6453 {
6454 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6455 1);
6456 if (from == to_byte)
6457 {
6458 coding_free_composition_data (coding);
6459 goto no_need_of_encoding;
6460 }
6461 shrinked_bytes = from + (SBYTES (str) - to_byte);
6462 }
6463
6464 len = encoding_buffer_size (coding, to_byte - from);
6465 allocate_conversion_buffer (buf, len);
6466
6467 consumed = consumed_char = produced = produced_char = 0;
6468 while (1)
6469 {
6470 result = encode_coding (coding, SDATA (str) + from + consumed,
6471 buf.data + produced, to_byte - from - consumed,
6472 buf.size - produced);
6473 consumed += coding->consumed;
6474 consumed_char += coding->consumed_char;
6475 produced += coding->produced;
6476 produced_char += coding->produced_char;
6477 if (result == CODING_FINISH_NORMAL
6478 || result == CODING_FINISH_INTERRUPT
6479 || (result == CODING_FINISH_INSUFFICIENT_SRC
6480 && coding->consumed == 0))
6481 break;
6482 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6483 extend_conversion_buffer (&buf);
6484 }
6485
6486 coding->consumed = consumed;
6487 coding->consumed_char = consumed_char;
6488 coding->produced = produced;
6489 coding->produced_char = produced_char;
6490
6491 newstr = make_uninit_string (produced + shrinked_bytes);
6492 if (from > 0)
6493 STRING_COPYIN (newstr, 0, SDATA (str), from);
6494 STRING_COPYIN (newstr, from, buf.data, produced);
6495 if (shrinked_bytes > from)
6496 STRING_COPYIN (newstr, from + produced,
6497 SDATA (str) + to_byte,
6498 shrinked_bytes - from);
6499
6500 free_conversion_buffer (&buf);
6501 coding_free_composition_data (coding);
6502
6503 return newstr;
6504
6505 no_need_of_encoding:
6506 coding->consumed = SBYTES (str);
6507 coding->consumed_char = SCHARS (str);
6508 if (STRING_MULTIBYTE (str))
6509 {
6510 if (nocopy)
6511 /* We are sure that STR doesn't contain a multibyte
6512 character. */
6513 STRING_SET_UNIBYTE (str);
6514 else
6515 {
6516 str = Fstring_as_unibyte (str);
6517 nocopy = 1;
6518 }
6519 }
6520 coding->produced = SBYTES (str);
6521 coding->produced_char = SCHARS (str);
6522 return (nocopy ? str : Fcopy_sequence (str));
6523 }
6524
6525 \f
6526 #ifdef emacs
6527 /*** 8. Emacs Lisp library functions ***/
6528
6529 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6530 doc: /* Return t if OBJECT is nil or a coding-system.
6531 See the documentation of `make-coding-system' for information
6532 about coding-system objects. */)
6533 (obj)
6534 Lisp_Object obj;
6535 {
6536 if (NILP (obj))
6537 return Qt;
6538 if (!SYMBOLP (obj))
6539 return Qnil;
6540 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6541 return Qt;
6542 /* Get coding-spec vector for OBJ. */
6543 obj = Fget (obj, Qcoding_system);
6544 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6545 ? Qt : Qnil);
6546 }
6547
6548 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6549 Sread_non_nil_coding_system, 1, 1, 0,
6550 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6551 (prompt)
6552 Lisp_Object prompt;
6553 {
6554 Lisp_Object val;
6555 do
6556 {
6557 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6558 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6559 }
6560 while (SCHARS (val) == 0);
6561 return (Fintern (val, Qnil));
6562 }
6563
6564 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6565 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6566 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6567 (prompt, default_coding_system)
6568 Lisp_Object prompt, default_coding_system;
6569 {
6570 Lisp_Object val;
6571 if (SYMBOLP (default_coding_system))
6572 default_coding_system = SYMBOL_NAME (default_coding_system);
6573 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6574 Qt, Qnil, Qcoding_system_history,
6575 default_coding_system, Qnil);
6576 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6577 }
6578
6579 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6580 1, 1, 0,
6581 doc: /* Check validity of CODING-SYSTEM.
6582 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6583 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6584 The value of this property should be a vector of length 5. */)
6585 (coding_system)
6586 Lisp_Object coding_system;
6587 {
6588 Lisp_Object define_form;
6589
6590 define_form = Fget (coding_system, Qcoding_system_define_form);
6591 if (! NILP (define_form))
6592 {
6593 Fput (coding_system, Qcoding_system_define_form, Qnil);
6594 safe_eval (define_form);
6595 }
6596 if (!NILP (Fcoding_system_p (coding_system)))
6597 return coding_system;
6598 xsignal1 (Qcoding_system_error, coding_system);
6599 }
6600 \f
6601 Lisp_Object
6602 detect_coding_system (src, src_bytes, highest, multibytep)
6603 const unsigned char *src;
6604 int src_bytes, highest;
6605 int multibytep;
6606 {
6607 int coding_mask, eol_type;
6608 Lisp_Object val, tmp;
6609 int dummy;
6610
6611 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6612 eol_type = detect_eol_type (src, src_bytes, &dummy);
6613 if (eol_type == CODING_EOL_INCONSISTENT)
6614 eol_type = CODING_EOL_UNDECIDED;
6615
6616 if (!coding_mask)
6617 {
6618 val = Qundecided;
6619 if (eol_type != CODING_EOL_UNDECIDED)
6620 {
6621 Lisp_Object val2;
6622 val2 = Fget (Qundecided, Qeol_type);
6623 if (VECTORP (val2))
6624 val = XVECTOR (val2)->contents[eol_type];
6625 }
6626 return (highest ? val : Fcons (val, Qnil));
6627 }
6628
6629 /* At first, gather possible coding systems in VAL. */
6630 val = Qnil;
6631 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6632 {
6633 Lisp_Object category_val, category_index;
6634
6635 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6636 category_val = Fsymbol_value (XCAR (tmp));
6637 if (!NILP (category_val)
6638 && NATNUMP (category_index)
6639 && (coding_mask & (1 << XFASTINT (category_index))))
6640 {
6641 val = Fcons (category_val, val);
6642 if (highest)
6643 break;
6644 }
6645 }
6646 if (!highest)
6647 val = Fnreverse (val);
6648
6649 /* Then, replace the elements with subsidiary coding systems. */
6650 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6651 {
6652 if (eol_type != CODING_EOL_UNDECIDED
6653 && eol_type != CODING_EOL_INCONSISTENT)
6654 {
6655 Lisp_Object eol;
6656 eol = Fget (XCAR (tmp), Qeol_type);
6657 if (VECTORP (eol))
6658 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6659 }
6660 }
6661 return (highest ? XCAR (val) : val);
6662 }
6663
6664 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6665 2, 3, 0,
6666 doc: /* Detect how the byte sequence in the region is encoded.
6667 Return a list of possible coding systems used on decoding a byte
6668 sequence containing the bytes in the region between START and END when
6669 the coding system `undecided' is specified. The list is ordered by
6670 priority decided in the current language environment.
6671
6672 If only ASCII characters are found, it returns a list of single element
6673 `undecided' or its subsidiary coding system according to a detected
6674 end-of-line format.
6675
6676 If optional argument HIGHEST is non-nil, return the coding system of
6677 highest priority. */)
6678 (start, end, highest)
6679 Lisp_Object start, end, highest;
6680 {
6681 int from, to;
6682 int from_byte, to_byte;
6683 int include_anchor_byte = 0;
6684
6685 CHECK_NUMBER_COERCE_MARKER (start);
6686 CHECK_NUMBER_COERCE_MARKER (end);
6687
6688 validate_region (&start, &end);
6689 from = XINT (start), to = XINT (end);
6690 from_byte = CHAR_TO_BYTE (from);
6691 to_byte = CHAR_TO_BYTE (to);
6692
6693 if (from < GPT && to >= GPT)
6694 move_gap_both (to, to_byte);
6695 /* If we an anchor byte `\0' follows the region, we include it in
6696 the detecting source. Then code detectors can handle the tailing
6697 byte sequence more accurately.
6698
6699 Fix me: This is not a perfect solution. It is better that we
6700 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6701 */
6702 if (to == Z || (to == GPT && GAP_SIZE > 0))
6703 include_anchor_byte = 1;
6704 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6705 to_byte - from_byte + include_anchor_byte,
6706 !NILP (highest),
6707 !NILP (current_buffer
6708 ->enable_multibyte_characters));
6709 }
6710
6711 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6712 1, 2, 0,
6713 doc: /* Detect how the byte sequence in STRING is encoded.
6714 Return a list of possible coding systems used on decoding a byte
6715 sequence containing the bytes in STRING when the coding system
6716 `undecided' is specified. The list is ordered by priority decided in
6717 the current language environment.
6718
6719 If only ASCII characters are found, it returns a list of single element
6720 `undecided' or its subsidiary coding system according to a detected
6721 end-of-line format.
6722
6723 If optional argument HIGHEST is non-nil, return the coding system of
6724 highest priority. */)
6725 (string, highest)
6726 Lisp_Object string, highest;
6727 {
6728 CHECK_STRING (string);
6729
6730 return detect_coding_system (SDATA (string),
6731 /* "+ 1" is to include the anchor byte
6732 `\0'. With this, code detectors can
6733 handle the tailing bytes more
6734 accurately. */
6735 SBYTES (string) + 1,
6736 !NILP (highest),
6737 STRING_MULTIBYTE (string));
6738 }
6739
6740 /* Subroutine for Ffind_coding_systems_region_internal.
6741
6742 Return a list of coding systems that safely encode the multibyte
6743 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6744 possible coding systems. If it is nil, it means that we have not
6745 yet found any coding systems.
6746
6747 WORK_TABLE a char-table of which element is set to t once the
6748 element is looked up.
6749
6750 If a non-ASCII single byte char is found, set
6751 *single_byte_char_found to 1. */
6752
6753 static Lisp_Object
6754 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6755 unsigned char *p, *pend;
6756 Lisp_Object safe_codings, work_table;
6757 int *single_byte_char_found;
6758 {
6759 int c, len;
6760 Lisp_Object val, ch;
6761 Lisp_Object prev, tail;
6762
6763 if (NILP (safe_codings))
6764 goto done_safe_codings;
6765 while (p < pend)
6766 {
6767 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6768 p += len;
6769 if (ASCII_BYTE_P (c))
6770 /* We can ignore ASCII characters here. */
6771 continue;
6772 if (SINGLE_BYTE_CHAR_P (c))
6773 *single_byte_char_found = 1;
6774 /* Check the safe coding systems for C. */
6775 ch = make_number (c);
6776 val = Faref (work_table, ch);
6777 if (EQ (val, Qt))
6778 /* This element was already checked. Ignore it. */
6779 continue;
6780 /* Remember that we checked this element. */
6781 Faset (work_table, ch, Qt);
6782
6783 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6784 {
6785 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6786 int encodable;
6787
6788 elt = XCAR (tail);
6789 if (CONSP (XCDR (elt)))
6790 {
6791 /* This entry has this format now:
6792 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6793 ACCEPT-LATIN-EXTRA ) */
6794 val = XCDR (elt);
6795 encodable = ! NILP (Faref (XCAR (val), ch));
6796 if (! encodable)
6797 {
6798 val = XCDR (val);
6799 translation_table = XCAR (val);
6800 hash_table = XCAR (XCDR (val));
6801 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6802 }
6803 }
6804 else
6805 {
6806 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6807 encodable = ! NILP (Faref (XCDR (elt), ch));
6808 if (! encodable)
6809 {
6810 /* Transform the format to:
6811 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6812 ACCEPT-LATIN-EXTRA ) */
6813 val = Fget (XCAR (elt), Qcoding_system);
6814 translation_table
6815 = Fplist_get (AREF (val, 3),
6816 Qtranslation_table_for_encode);
6817 if (SYMBOLP (translation_table))
6818 translation_table = Fget (translation_table,
6819 Qtranslation_table);
6820 hash_table
6821 = (CHAR_TABLE_P (translation_table)
6822 ? XCHAR_TABLE (translation_table)->extras[1]
6823 : Qnil);
6824 accept_latin_extra
6825 = ((EQ (AREF (val, 0), make_number (2))
6826 && VECTORP (AREF (val, 4)))
6827 ? AREF (AREF (val, 4), 16)
6828 : Qnil);
6829 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6830 translation_table, hash_table,
6831 accept_latin_extra));
6832 }
6833 }
6834
6835 if (! encodable
6836 && ((CHAR_TABLE_P (translation_table)
6837 && ! NILP (Faref (translation_table, ch)))
6838 || (HASH_TABLE_P (hash_table)
6839 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6840 || (SINGLE_BYTE_CHAR_P (c)
6841 && ! NILP (accept_latin_extra)
6842 && VECTORP (Vlatin_extra_code_table)
6843 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6844 encodable = 1;
6845 if (encodable)
6846 prev = tail;
6847 else
6848 {
6849 /* Exclude this coding system from SAFE_CODINGS. */
6850 if (EQ (tail, safe_codings))
6851 {
6852 safe_codings = XCDR (safe_codings);
6853 if (NILP (safe_codings))
6854 goto done_safe_codings;
6855 }
6856 else
6857 XSETCDR (prev, XCDR (tail));
6858 }
6859 }
6860 }
6861
6862 done_safe_codings:
6863 /* If the above loop was terminated before P reaches PEND, it means
6864 SAFE_CODINGS was set to nil. If we have not yet found an
6865 non-ASCII single-byte char, check it now. */
6866 if (! *single_byte_char_found)
6867 while (p < pend)
6868 {
6869 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6870 p += len;
6871 if (! ASCII_BYTE_P (c)
6872 && SINGLE_BYTE_CHAR_P (c))
6873 {
6874 *single_byte_char_found = 1;
6875 break;
6876 }
6877 }
6878 return safe_codings;
6879 }
6880
6881 DEFUN ("find-coding-systems-region-internal",
6882 Ffind_coding_systems_region_internal,
6883 Sfind_coding_systems_region_internal, 2, 2, 0,
6884 doc: /* Internal use only. */)
6885 (start, end)
6886 Lisp_Object start, end;
6887 {
6888 Lisp_Object work_table, safe_codings;
6889 int non_ascii_p = 0;
6890 int single_byte_char_found = 0;
6891 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6892
6893 if (STRINGP (start))
6894 {
6895 if (!STRING_MULTIBYTE (start))
6896 return Qt;
6897 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6898 p2 = p2end = p1end;
6899 if (SCHARS (start) != SBYTES (start))
6900 non_ascii_p = 1;
6901 }
6902 else
6903 {
6904 int from, to, stop;
6905
6906 CHECK_NUMBER_COERCE_MARKER (start);
6907 CHECK_NUMBER_COERCE_MARKER (end);
6908 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6909 args_out_of_range (start, end);
6910 if (NILP (current_buffer->enable_multibyte_characters))
6911 return Qt;
6912 from = CHAR_TO_BYTE (XINT (start));
6913 to = CHAR_TO_BYTE (XINT (end));
6914 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6915 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6916 if (stop == to)
6917 p2 = p2end = p1end;
6918 else
6919 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6920 if (XINT (end) - XINT (start) != to - from)
6921 non_ascii_p = 1;
6922 }
6923
6924 if (!non_ascii_p)
6925 {
6926 /* We are sure that the text contains no multibyte character.
6927 Check if it contains eight-bit-graphic. */
6928 p = p1;
6929 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6930 if (p == p1end)
6931 {
6932 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6933 if (p == p2end)
6934 return Qt;
6935 }
6936 }
6937
6938 /* The text contains non-ASCII characters. */
6939
6940 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6941 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6942
6943 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6944 &single_byte_char_found);
6945 if (p2 < p2end)
6946 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6947 &single_byte_char_found);
6948 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6949 safe_codings = Qt;
6950 else
6951 {
6952 /* Turn safe_codings to a list of coding systems... */
6953 Lisp_Object val;
6954
6955 if (single_byte_char_found)
6956 /* ... and append these for eight-bit chars. */
6957 val = Fcons (Qraw_text,
6958 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6959 else
6960 /* ... and append generic coding systems. */
6961 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6962
6963 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6964 val = Fcons (XCAR (XCAR (safe_codings)), val);
6965 safe_codings = val;
6966 }
6967
6968 return safe_codings;
6969 }
6970
6971
6972 /* Search from position POS for such characters that are unencodable
6973 accoding to SAFE_CHARS, and return a list of their positions. P
6974 points where in the memory the character at POS exists. Limit the
6975 search at PEND or when Nth unencodable characters are found.
6976
6977 If SAFE_CHARS is a char table, an element for an unencodable
6978 character is nil.
6979
6980 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6981
6982 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6983 eight-bit-graphic characters are unencodable. */
6984
6985 static Lisp_Object
6986 unencodable_char_position (safe_chars, pos, p, pend, n)
6987 Lisp_Object safe_chars;
6988 int pos;
6989 unsigned char *p, *pend;
6990 int n;
6991 {
6992 Lisp_Object pos_list;
6993
6994 pos_list = Qnil;
6995 while (p < pend)
6996 {
6997 int len;
6998 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6999
7000 if (c >= 128
7001 && (CHAR_TABLE_P (safe_chars)
7002 ? NILP (CHAR_TABLE_REF (safe_chars, c))
7003 : (NILP (safe_chars) || c < 256)))
7004 {
7005 pos_list = Fcons (make_number (pos), pos_list);
7006 if (--n <= 0)
7007 break;
7008 }
7009 pos++;
7010 p += len;
7011 }
7012 return Fnreverse (pos_list);
7013 }
7014
7015
7016 DEFUN ("unencodable-char-position", Funencodable_char_position,
7017 Sunencodable_char_position, 3, 5, 0,
7018 doc: /*
7019 Return position of first un-encodable character in a region.
7020 START and END specfiy the region and CODING-SYSTEM specifies the
7021 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7022
7023 If optional 4th argument COUNT is non-nil, it specifies at most how
7024 many un-encodable characters to search. In this case, the value is a
7025 list of positions.
7026
7027 If optional 5th argument STRING is non-nil, it is a string to search
7028 for un-encodable characters. In that case, START and END are indexes
7029 to the string. */)
7030 (start, end, coding_system, count, string)
7031 Lisp_Object start, end, coding_system, count, string;
7032 {
7033 int n;
7034 Lisp_Object safe_chars;
7035 struct coding_system coding;
7036 Lisp_Object positions;
7037 int from, to;
7038 unsigned char *p, *pend;
7039
7040 if (NILP (string))
7041 {
7042 validate_region (&start, &end);
7043 from = XINT (start);
7044 to = XINT (end);
7045 if (NILP (current_buffer->enable_multibyte_characters))
7046 return Qnil;
7047 p = CHAR_POS_ADDR (from);
7048 if (to == GPT)
7049 pend = GPT_ADDR;
7050 else
7051 pend = CHAR_POS_ADDR (to);
7052 }
7053 else
7054 {
7055 CHECK_STRING (string);
7056 CHECK_NATNUM (start);
7057 CHECK_NATNUM (end);
7058 from = XINT (start);
7059 to = XINT (end);
7060 if (from > to
7061 || to > SCHARS (string))
7062 args_out_of_range_3 (string, start, end);
7063 if (! STRING_MULTIBYTE (string))
7064 return Qnil;
7065 p = SDATA (string) + string_char_to_byte (string, from);
7066 pend = SDATA (string) + string_char_to_byte (string, to);
7067 }
7068
7069 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7070
7071 if (NILP (count))
7072 n = 1;
7073 else
7074 {
7075 CHECK_NATNUM (count);
7076 n = XINT (count);
7077 }
7078
7079 if (coding.type == coding_type_no_conversion
7080 || coding.type == coding_type_raw_text)
7081 return Qnil;
7082
7083 if (coding.type == coding_type_undecided)
7084 safe_chars = Qnil;
7085 else
7086 safe_chars = coding_safe_chars (coding_system);
7087
7088 if (STRINGP (string)
7089 || from >= GPT || to <= GPT)
7090 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7091 else
7092 {
7093 Lisp_Object args[2];
7094
7095 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7096 n -= XINT (Flength (args[0]));
7097 if (n <= 0)
7098 positions = args[0];
7099 else
7100 {
7101 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7102 pend, n);
7103 positions = Fappend (2, args);
7104 }
7105 }
7106
7107 return (NILP (count) ? Fcar (positions) : positions);
7108 }
7109
7110
7111 Lisp_Object
7112 code_convert_region1 (start, end, coding_system, encodep)
7113 Lisp_Object start, end, coding_system;
7114 int encodep;
7115 {
7116 struct coding_system coding;
7117 int from, to;
7118
7119 CHECK_NUMBER_COERCE_MARKER (start);
7120 CHECK_NUMBER_COERCE_MARKER (end);
7121 CHECK_SYMBOL (coding_system);
7122
7123 validate_region (&start, &end);
7124 from = XFASTINT (start);
7125 to = XFASTINT (end);
7126
7127 if (NILP (coding_system))
7128 return make_number (to - from);
7129
7130 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7131 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7132
7133 coding.mode |= CODING_MODE_LAST_BLOCK;
7134 coding.src_multibyte = coding.dst_multibyte
7135 = !NILP (current_buffer->enable_multibyte_characters);
7136 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7137 &coding, encodep, 1);
7138 Vlast_coding_system_used = coding.symbol;
7139 return make_number (coding.produced_char);
7140 }
7141
7142 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7143 3, 3, "r\nzCoding system: ",
7144 doc: /* Decode the current region from the specified coding system.
7145 When called from a program, takes three arguments:
7146 START, END, and CODING-SYSTEM. START and END are buffer positions.
7147 This function sets `last-coding-system-used' to the precise coding system
7148 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7149 not fully specified.)
7150 It returns the length of the decoded text. */)
7151 (start, end, coding_system)
7152 Lisp_Object start, end, coding_system;
7153 {
7154 return code_convert_region1 (start, end, coding_system, 0);
7155 }
7156
7157 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7158 3, 3, "r\nzCoding system: ",
7159 doc: /* Encode the current region into the specified coding system.
7160 When called from a program, takes three arguments:
7161 START, END, and CODING-SYSTEM. START and END are buffer positions.
7162 This function sets `last-coding-system-used' to the precise coding system
7163 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7164 not fully specified.)
7165 It returns the length of the encoded text. */)
7166 (start, end, coding_system)
7167 Lisp_Object start, end, coding_system;
7168 {
7169 return code_convert_region1 (start, end, coding_system, 1);
7170 }
7171
7172 Lisp_Object
7173 code_convert_string1 (string, coding_system, nocopy, encodep)
7174 Lisp_Object string, coding_system, nocopy;
7175 int encodep;
7176 {
7177 struct coding_system coding;
7178
7179 CHECK_STRING (string);
7180 CHECK_SYMBOL (coding_system);
7181
7182 if (NILP (coding_system))
7183 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7184
7185 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7186 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7187
7188 coding.mode |= CODING_MODE_LAST_BLOCK;
7189 string = (encodep
7190 ? encode_coding_string (string, &coding, !NILP (nocopy))
7191 : decode_coding_string (string, &coding, !NILP (nocopy)));
7192 Vlast_coding_system_used = coding.symbol;
7193
7194 return string;
7195 }
7196
7197 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7198 2, 3, 0,
7199 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7200 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7201 if the decoding operation is trivial.
7202 This function sets `last-coding-system-used' to the precise coding system
7203 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7204 not fully specified.) */)
7205 (string, coding_system, nocopy)
7206 Lisp_Object string, coding_system, nocopy;
7207 {
7208 return code_convert_string1 (string, coding_system, nocopy, 0);
7209 }
7210
7211 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7212 2, 3, 0,
7213 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7214 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7215 if the encoding operation is trivial.
7216 This function sets `last-coding-system-used' to the precise coding system
7217 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7218 not fully specified.) */)
7219 (string, coding_system, nocopy)
7220 Lisp_Object string, coding_system, nocopy;
7221 {
7222 return code_convert_string1 (string, coding_system, nocopy, 1);
7223 }
7224
7225 /* Encode or decode STRING according to CODING_SYSTEM.
7226 Do not set Vlast_coding_system_used.
7227
7228 This function is called only from macros DECODE_FILE and
7229 ENCODE_FILE, thus we ignore character composition. */
7230
7231 Lisp_Object
7232 code_convert_string_norecord (string, coding_system, encodep)
7233 Lisp_Object string, coding_system;
7234 int encodep;
7235 {
7236 struct coding_system coding;
7237
7238 CHECK_STRING (string);
7239 CHECK_SYMBOL (coding_system);
7240
7241 if (NILP (coding_system))
7242 return string;
7243
7244 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7245 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7246
7247 coding.composing = COMPOSITION_DISABLED;
7248 coding.mode |= CODING_MODE_LAST_BLOCK;
7249 return (encodep
7250 ? encode_coding_string (string, &coding, 1)
7251 : decode_coding_string (string, &coding, 1));
7252 }
7253 \f
7254 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7255 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7256 Return the corresponding character. */)
7257 (code)
7258 Lisp_Object code;
7259 {
7260 unsigned char c1, c2, s1, s2;
7261 Lisp_Object val;
7262
7263 CHECK_NUMBER (code);
7264 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7265 if (s1 == 0)
7266 {
7267 if (s2 < 0x80)
7268 XSETFASTINT (val, s2);
7269 else if (s2 >= 0xA0 || s2 <= 0xDF)
7270 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7271 else
7272 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7273 }
7274 else
7275 {
7276 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7277 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7278 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7279 DECODE_SJIS (s1, s2, c1, c2);
7280 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7281 }
7282 return val;
7283 }
7284
7285 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7286 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7287 Return the corresponding code in SJIS. */)
7288 (ch)
7289 Lisp_Object ch;
7290 {
7291 int charset, c1, c2, s1, s2;
7292 Lisp_Object val;
7293
7294 CHECK_NUMBER (ch);
7295 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7296 if (charset == CHARSET_ASCII)
7297 {
7298 val = ch;
7299 }
7300 else if (charset == charset_jisx0208
7301 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7302 {
7303 ENCODE_SJIS (c1, c2, s1, s2);
7304 XSETFASTINT (val, (s1 << 8) | s2);
7305 }
7306 else if (charset == charset_katakana_jisx0201
7307 && c1 > 0x20 && c2 < 0xE0)
7308 {
7309 XSETFASTINT (val, c1 | 0x80);
7310 }
7311 else
7312 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7313 return val;
7314 }
7315
7316 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7317 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7318 Return the corresponding character. */)
7319 (code)
7320 Lisp_Object code;
7321 {
7322 int charset;
7323 unsigned char b1, b2, c1, c2;
7324 Lisp_Object val;
7325
7326 CHECK_NUMBER (code);
7327 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7328 if (b1 == 0)
7329 {
7330 if (b2 >= 0x80)
7331 error ("Invalid BIG5 code: %x", XFASTINT (code));
7332 val = code;
7333 }
7334 else
7335 {
7336 if ((b1 < 0xA1 || b1 > 0xFE)
7337 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7338 error ("Invalid BIG5 code: %x", XFASTINT (code));
7339 DECODE_BIG5 (b1, b2, charset, c1, c2);
7340 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7341 }
7342 return val;
7343 }
7344
7345 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7346 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7347 Return the corresponding character code in Big5. */)
7348 (ch)
7349 Lisp_Object ch;
7350 {
7351 int charset, c1, c2, b1, b2;
7352 Lisp_Object val;
7353
7354 CHECK_NUMBER (ch);
7355 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7356 if (charset == CHARSET_ASCII)
7357 {
7358 val = ch;
7359 }
7360 else if ((charset == charset_big5_1
7361 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7362 || (charset == charset_big5_2
7363 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7364 {
7365 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7366 XSETFASTINT (val, (b1 << 8) | b2);
7367 }
7368 else
7369 error ("Can't encode to Big5: %d", XFASTINT (ch));
7370 return val;
7371 }
7372 \f
7373 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7374 Sset_terminal_coding_system_internal, 1, 1, 0,
7375 doc: /* Internal use only. */)
7376 (coding_system)
7377 Lisp_Object coding_system;
7378 {
7379 CHECK_SYMBOL (coding_system);
7380 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7381 /* We had better not send unsafe characters to terminal. */
7382 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7383 /* Character composition should be disabled. */
7384 terminal_coding.composing = COMPOSITION_DISABLED;
7385 /* Error notification should be suppressed. */
7386 terminal_coding.suppress_error = 1;
7387 terminal_coding.src_multibyte = 1;
7388 terminal_coding.dst_multibyte = 0;
7389 return Qnil;
7390 }
7391
7392 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7393 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7394 doc: /* Internal use only. */)
7395 (coding_system)
7396 Lisp_Object coding_system;
7397 {
7398 CHECK_SYMBOL (coding_system);
7399 setup_coding_system (Fcheck_coding_system (coding_system),
7400 &safe_terminal_coding);
7401 /* Character composition should be disabled. */
7402 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7403 /* Error notification should be suppressed. */
7404 safe_terminal_coding.suppress_error = 1;
7405 safe_terminal_coding.src_multibyte = 1;
7406 safe_terminal_coding.dst_multibyte = 0;
7407 return Qnil;
7408 }
7409
7410 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7411 Sterminal_coding_system, 0, 0, 0,
7412 doc: /* Return coding system specified for terminal output. */)
7413 ()
7414 {
7415 return terminal_coding.symbol;
7416 }
7417
7418 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7419 Sset_keyboard_coding_system_internal, 1, 1, 0,
7420 doc: /* Internal use only. */)
7421 (coding_system)
7422 Lisp_Object coding_system;
7423 {
7424 CHECK_SYMBOL (coding_system);
7425 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7426 /* Character composition should be disabled. */
7427 keyboard_coding.composing = COMPOSITION_DISABLED;
7428 return Qnil;
7429 }
7430
7431 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7432 Skeyboard_coding_system, 0, 0, 0,
7433 doc: /* Return coding system specified for decoding keyboard input. */)
7434 ()
7435 {
7436 return keyboard_coding.symbol;
7437 }
7438
7439 \f
7440 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7441 Sfind_operation_coding_system, 1, MANY, 0,
7442 doc: /* Choose a coding system for an operation based on the target name.
7443 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7444 DECODING-SYSTEM is the coding system to use for decoding
7445 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7446 for encoding (in case OPERATION does encoding).
7447
7448 The first argument OPERATION specifies an I/O primitive:
7449 For file I/O, `insert-file-contents' or `write-region'.
7450 For process I/O, `call-process', `call-process-region', or `start-process'.
7451 For network I/O, `open-network-stream'.
7452
7453 The remaining arguments should be the same arguments that were passed
7454 to the primitive. Depending on which primitive, one of those arguments
7455 is selected as the TARGET. For example, if OPERATION does file I/O,
7456 whichever argument specifies the file name is TARGET.
7457
7458 TARGET has a meaning which depends on OPERATION:
7459 For file I/O, TARGET is a file name (except for the special case below).
7460 For process I/O, TARGET is a process name.
7461 For network I/O, TARGET is a service name or a port number
7462
7463 This function looks up what specified for TARGET in,
7464 `file-coding-system-alist', `process-coding-system-alist',
7465 or `network-coding-system-alist' depending on OPERATION.
7466 They may specify a coding system, a cons of coding systems,
7467 or a function symbol to call.
7468 In the last case, we call the function with one argument,
7469 which is a list of all the arguments given to this function.
7470
7471 If OPERATION is `insert-file-contents', the argument corresponding to
7472 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
7473 file name to look up, and BUFFER is a buffer that contains the file's
7474 contents (not yet decoded). If `file-coding-system-alist' specifies a
7475 function to call for FILENAME, that function should examine the
7476 contents of BUFFER instead of reading the file.
7477
7478 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7479 (nargs, args)
7480 int nargs;
7481 Lisp_Object *args;
7482 {
7483 Lisp_Object operation, target_idx, target, val;
7484 register Lisp_Object chain;
7485
7486 if (nargs < 2)
7487 error ("Too few arguments");
7488 operation = args[0];
7489 if (!SYMBOLP (operation)
7490 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7491 error ("Invalid first argument");
7492 if (nargs < 1 + XINT (target_idx))
7493 error ("Too few arguments for operation: %s",
7494 SDATA (SYMBOL_NAME (operation)));
7495 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7496 argument to write-region) is string, it must be treated as a
7497 target file name. */
7498 if (EQ (operation, Qwrite_region)
7499 && nargs > 5
7500 && STRINGP (args[5]))
7501 target_idx = make_number (4);
7502 target = args[XINT (target_idx) + 1];
7503 if (!(STRINGP (target)
7504 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7505 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7506 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7507 error ("Invalid argument %d", XINT (target_idx) + 1);
7508 if (CONSP (target))
7509 target = XCAR (target);
7510
7511 chain = ((EQ (operation, Qinsert_file_contents)
7512 || EQ (operation, Qwrite_region))
7513 ? Vfile_coding_system_alist
7514 : (EQ (operation, Qopen_network_stream)
7515 ? Vnetwork_coding_system_alist
7516 : Vprocess_coding_system_alist));
7517 if (NILP (chain))
7518 return Qnil;
7519
7520 for (; CONSP (chain); chain = XCDR (chain))
7521 {
7522 Lisp_Object elt;
7523 elt = XCAR (chain);
7524
7525 if (CONSP (elt)
7526 && ((STRINGP (target)
7527 && STRINGP (XCAR (elt))
7528 && fast_string_match (XCAR (elt), target) >= 0)
7529 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7530 {
7531 val = XCDR (elt);
7532 /* Here, if VAL is both a valid coding system and a valid
7533 function symbol, we return VAL as a coding system. */
7534 if (CONSP (val))
7535 return val;
7536 if (! SYMBOLP (val))
7537 return Qnil;
7538 if (! NILP (Fcoding_system_p (val)))
7539 return Fcons (val, val);
7540 if (! NILP (Ffboundp (val)))
7541 {
7542 val = safe_call1 (val, Flist (nargs, args));
7543 if (CONSP (val))
7544 return val;
7545 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7546 return Fcons (val, val);
7547 }
7548 return Qnil;
7549 }
7550 }
7551 return Qnil;
7552 }
7553
7554 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7555 Supdate_coding_systems_internal, 0, 0, 0,
7556 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7557 When values of any coding categories are changed, you must
7558 call this function. */)
7559 ()
7560 {
7561 int i;
7562
7563 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7564 {
7565 Lisp_Object val;
7566
7567 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7568 if (!NILP (val))
7569 {
7570 if (! coding_system_table[i])
7571 coding_system_table[i] = ((struct coding_system *)
7572 xmalloc (sizeof (struct coding_system)));
7573 setup_coding_system (val, coding_system_table[i]);
7574 }
7575 else if (coding_system_table[i])
7576 {
7577 xfree (coding_system_table[i]);
7578 coding_system_table[i] = NULL;
7579 }
7580 }
7581
7582 return Qnil;
7583 }
7584
7585 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7586 Sset_coding_priority_internal, 0, 0, 0,
7587 doc: /* Update internal database for the current value of `coding-category-list'.
7588 This function is internal use only. */)
7589 ()
7590 {
7591 int i = 0, idx;
7592 Lisp_Object val;
7593
7594 val = Vcoding_category_list;
7595
7596 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7597 {
7598 if (! SYMBOLP (XCAR (val)))
7599 break;
7600 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7601 if (idx >= CODING_CATEGORY_IDX_MAX)
7602 break;
7603 coding_priorities[i++] = (1 << idx);
7604 val = XCDR (val);
7605 }
7606 /* If coding-category-list is valid and contains all coding
7607 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7608 the following code saves Emacs from crashing. */
7609 while (i < CODING_CATEGORY_IDX_MAX)
7610 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7611
7612 return Qnil;
7613 }
7614
7615 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7616 Sdefine_coding_system_internal, 1, 1, 0,
7617 doc: /* Register CODING-SYSTEM as a base coding system.
7618 This function is internal use only. */)
7619 (coding_system)
7620 Lisp_Object coding_system;
7621 {
7622 Lisp_Object safe_chars, slot;
7623
7624 if (NILP (Fcheck_coding_system (coding_system)))
7625 xsignal1 (Qcoding_system_error, coding_system);
7626
7627 safe_chars = coding_safe_chars (coding_system);
7628 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7629 error ("No valid safe-chars property for %s",
7630 SDATA (SYMBOL_NAME (coding_system)));
7631
7632 if (EQ (safe_chars, Qt))
7633 {
7634 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7635 XSETCAR (Vcoding_system_safe_chars,
7636 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7637 }
7638 else
7639 {
7640 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7641 if (NILP (slot))
7642 XSETCDR (Vcoding_system_safe_chars,
7643 nconc2 (XCDR (Vcoding_system_safe_chars),
7644 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7645 else
7646 XSETCDR (slot, safe_chars);
7647 }
7648 return Qnil;
7649 }
7650
7651 #endif /* emacs */
7652
7653 \f
7654 /*** 9. Post-amble ***/
7655
7656 void
7657 init_coding_once ()
7658 {
7659 int i;
7660
7661 /* Emacs' internal format specific initialize routine. */
7662 for (i = 0; i <= 0x20; i++)
7663 emacs_code_class[i] = EMACS_control_code;
7664 emacs_code_class[0x0A] = EMACS_linefeed_code;
7665 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7666 for (i = 0x21 ; i < 0x7F; i++)
7667 emacs_code_class[i] = EMACS_ascii_code;
7668 emacs_code_class[0x7F] = EMACS_control_code;
7669 for (i = 0x80; i < 0xFF; i++)
7670 emacs_code_class[i] = EMACS_invalid_code;
7671 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7672 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7673 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7674 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7675
7676 /* ISO2022 specific initialize routine. */
7677 for (i = 0; i < 0x20; i++)
7678 iso_code_class[i] = ISO_control_0;
7679 for (i = 0x21; i < 0x7F; i++)
7680 iso_code_class[i] = ISO_graphic_plane_0;
7681 for (i = 0x80; i < 0xA0; i++)
7682 iso_code_class[i] = ISO_control_1;
7683 for (i = 0xA1; i < 0xFF; i++)
7684 iso_code_class[i] = ISO_graphic_plane_1;
7685 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7686 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7687 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7688 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7689 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7690 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7691 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7692 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7693 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7694 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7695
7696 setup_coding_system (Qnil, &keyboard_coding);
7697 setup_coding_system (Qnil, &terminal_coding);
7698 setup_coding_system (Qnil, &safe_terminal_coding);
7699 setup_coding_system (Qnil, &default_buffer_file_coding);
7700
7701 bzero (coding_system_table, sizeof coding_system_table);
7702
7703 bzero (ascii_skip_code, sizeof ascii_skip_code);
7704 for (i = 0; i < 128; i++)
7705 ascii_skip_code[i] = 1;
7706
7707 #if defined (MSDOS) || defined (WINDOWSNT)
7708 system_eol_type = CODING_EOL_CRLF;
7709 #else
7710 system_eol_type = CODING_EOL_LF;
7711 #endif
7712
7713 inhibit_pre_post_conversion = 0;
7714 }
7715
7716 #ifdef emacs
7717
7718 void
7719 syms_of_coding ()
7720 {
7721 staticpro (&Vcode_conversion_workbuf_name);
7722 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7723
7724 Qtarget_idx = intern ("target-idx");
7725 staticpro (&Qtarget_idx);
7726
7727 Qcoding_system_history = intern ("coding-system-history");
7728 staticpro (&Qcoding_system_history);
7729 Fset (Qcoding_system_history, Qnil);
7730
7731 /* Target FILENAME is the first argument. */
7732 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7733 /* Target FILENAME is the third argument. */
7734 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7735
7736 Qcall_process = intern ("call-process");
7737 staticpro (&Qcall_process);
7738 /* Target PROGRAM is the first argument. */
7739 Fput (Qcall_process, Qtarget_idx, make_number (0));
7740
7741 Qcall_process_region = intern ("call-process-region");
7742 staticpro (&Qcall_process_region);
7743 /* Target PROGRAM is the third argument. */
7744 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7745
7746 Qstart_process = intern ("start-process");
7747 staticpro (&Qstart_process);
7748 /* Target PROGRAM is the third argument. */
7749 Fput (Qstart_process, Qtarget_idx, make_number (2));
7750
7751 Qopen_network_stream = intern ("open-network-stream");
7752 staticpro (&Qopen_network_stream);
7753 /* Target SERVICE is the fourth argument. */
7754 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7755
7756 Qcoding_system = intern ("coding-system");
7757 staticpro (&Qcoding_system);
7758
7759 Qeol_type = intern ("eol-type");
7760 staticpro (&Qeol_type);
7761
7762 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7763 staticpro (&Qbuffer_file_coding_system);
7764
7765 Qpost_read_conversion = intern ("post-read-conversion");
7766 staticpro (&Qpost_read_conversion);
7767
7768 Qpre_write_conversion = intern ("pre-write-conversion");
7769 staticpro (&Qpre_write_conversion);
7770
7771 Qno_conversion = intern ("no-conversion");
7772 staticpro (&Qno_conversion);
7773
7774 Qundecided = intern ("undecided");
7775 staticpro (&Qundecided);
7776
7777 Qcoding_system_p = intern ("coding-system-p");
7778 staticpro (&Qcoding_system_p);
7779
7780 Qcoding_system_error = intern ("coding-system-error");
7781 staticpro (&Qcoding_system_error);
7782
7783 Fput (Qcoding_system_error, Qerror_conditions,
7784 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7785 Fput (Qcoding_system_error, Qerror_message,
7786 build_string ("Invalid coding system"));
7787
7788 Qcoding_category = intern ("coding-category");
7789 staticpro (&Qcoding_category);
7790 Qcoding_category_index = intern ("coding-category-index");
7791 staticpro (&Qcoding_category_index);
7792
7793 Vcoding_category_table
7794 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7795 staticpro (&Vcoding_category_table);
7796 {
7797 int i;
7798 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7799 {
7800 XVECTOR (Vcoding_category_table)->contents[i]
7801 = intern (coding_category_name[i]);
7802 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7803 Qcoding_category_index, make_number (i));
7804 }
7805 }
7806
7807 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7808 staticpro (&Vcoding_system_safe_chars);
7809
7810 Qtranslation_table = intern ("translation-table");
7811 staticpro (&Qtranslation_table);
7812 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7813
7814 Qtranslation_table_id = intern ("translation-table-id");
7815 staticpro (&Qtranslation_table_id);
7816
7817 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7818 staticpro (&Qtranslation_table_for_decode);
7819
7820 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7821 staticpro (&Qtranslation_table_for_encode);
7822
7823 Qsafe_chars = intern ("safe-chars");
7824 staticpro (&Qsafe_chars);
7825
7826 Qchar_coding_system = intern ("char-coding-system");
7827 staticpro (&Qchar_coding_system);
7828
7829 /* Intern this now in case it isn't already done.
7830 Setting this variable twice is harmless.
7831 But don't staticpro it here--that is done in alloc.c. */
7832 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7833 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7834 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7835
7836 Qvalid_codes = intern ("valid-codes");
7837 staticpro (&Qvalid_codes);
7838
7839 Qascii_incompatible = intern ("ascii-incompatible");
7840 staticpro (&Qascii_incompatible);
7841
7842 Qemacs_mule = intern ("emacs-mule");
7843 staticpro (&Qemacs_mule);
7844
7845 Qraw_text = intern ("raw-text");
7846 staticpro (&Qraw_text);
7847
7848 Qutf_8 = intern ("utf-8");
7849 staticpro (&Qutf_8);
7850
7851 Qcoding_system_define_form = intern ("coding-system-define-form");
7852 staticpro (&Qcoding_system_define_form);
7853
7854 defsubr (&Scoding_system_p);
7855 defsubr (&Sread_coding_system);
7856 defsubr (&Sread_non_nil_coding_system);
7857 defsubr (&Scheck_coding_system);
7858 defsubr (&Sdetect_coding_region);
7859 defsubr (&Sdetect_coding_string);
7860 defsubr (&Sfind_coding_systems_region_internal);
7861 defsubr (&Sunencodable_char_position);
7862 defsubr (&Sdecode_coding_region);
7863 defsubr (&Sencode_coding_region);
7864 defsubr (&Sdecode_coding_string);
7865 defsubr (&Sencode_coding_string);
7866 defsubr (&Sdecode_sjis_char);
7867 defsubr (&Sencode_sjis_char);
7868 defsubr (&Sdecode_big5_char);
7869 defsubr (&Sencode_big5_char);
7870 defsubr (&Sset_terminal_coding_system_internal);
7871 defsubr (&Sset_safe_terminal_coding_system_internal);
7872 defsubr (&Sterminal_coding_system);
7873 defsubr (&Sset_keyboard_coding_system_internal);
7874 defsubr (&Skeyboard_coding_system);
7875 defsubr (&Sfind_operation_coding_system);
7876 defsubr (&Supdate_coding_systems_internal);
7877 defsubr (&Sset_coding_priority_internal);
7878 defsubr (&Sdefine_coding_system_internal);
7879
7880 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7881 doc: /* List of coding systems.
7882
7883 Do not alter the value of this variable manually. This variable should be
7884 updated by the functions `make-coding-system' and
7885 `define-coding-system-alias'. */);
7886 Vcoding_system_list = Qnil;
7887
7888 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7889 doc: /* Alist of coding system names.
7890 Each element is one element list of coding system name.
7891 This variable is given to `completing-read' as TABLE argument.
7892
7893 Do not alter the value of this variable manually. This variable should be
7894 updated by the functions `make-coding-system' and
7895 `define-coding-system-alias'. */);
7896 Vcoding_system_alist = Qnil;
7897
7898 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7899 doc: /* List of coding-categories (symbols) ordered by priority.
7900
7901 On detecting a coding system, Emacs tries code detection algorithms
7902 associated with each coding-category one by one in this order. When
7903 one algorithm agrees with a byte sequence of source text, the coding
7904 system bound to the corresponding coding-category is selected.
7905
7906 Don't modify this variable directly, but use `set-coding-priority'. */);
7907 {
7908 int i;
7909
7910 Vcoding_category_list = Qnil;
7911 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7912 Vcoding_category_list
7913 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7914 Vcoding_category_list);
7915 }
7916
7917 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7918 doc: /* Specify the coding system for read operations.
7919 It is useful to bind this variable with `let', but do not set it globally.
7920 If the value is a coding system, it is used for decoding on read operation.
7921 If not, an appropriate element is used from one of the coding system alists:
7922 There are three such tables, `file-coding-system-alist',
7923 `process-coding-system-alist', and `network-coding-system-alist'. */);
7924 Vcoding_system_for_read = Qnil;
7925
7926 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7927 doc: /* Specify the coding system for write operations.
7928 Programs bind this variable with `let', but you should not set it globally.
7929 If the value is a coding system, it is used for encoding of output,
7930 when writing it to a file and when sending it to a file or subprocess.
7931
7932 If this does not specify a coding system, an appropriate element
7933 is used from one of the coding system alists:
7934 There are three such tables, `file-coding-system-alist',
7935 `process-coding-system-alist', and `network-coding-system-alist'.
7936 For output to files, if the above procedure does not specify a coding system,
7937 the value of `buffer-file-coding-system' is used. */);
7938 Vcoding_system_for_write = Qnil;
7939
7940 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7941 doc: /* Coding system used in the latest file or process I/O.
7942 Also set by `encode-coding-region', `decode-coding-region',
7943 `encode-coding-string' and `decode-coding-string'. */);
7944 Vlast_coding_system_used = Qnil;
7945
7946 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7947 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7948 See info node `Coding Systems' and info node `Text and Binary' concerning
7949 such conversion. */);
7950 inhibit_eol_conversion = 0;
7951
7952 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7953 doc: /* Non-nil means process buffer inherits coding system of process output.
7954 Bind it to t if the process output is to be treated as if it were a file
7955 read from some filesystem. */);
7956 inherit_process_coding_system = 0;
7957
7958 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7959 doc: /* Alist to decide a coding system to use for a file I/O operation.
7960 The format is ((PATTERN . VAL) ...),
7961 where PATTERN is a regular expression matching a file name,
7962 VAL is a coding system, a cons of coding systems, or a function symbol.
7963 If VAL is a coding system, it is used for both decoding and encoding
7964 the file contents.
7965 If VAL is a cons of coding systems, the car part is used for decoding,
7966 and the cdr part is used for encoding.
7967 If VAL is a function symbol, the function must return a coding system
7968 or a cons of coding systems which are used as above. The function gets
7969 the arguments with which `find-operation-coding-system' was called.
7970
7971 See also the function `find-operation-coding-system'
7972 and the variable `auto-coding-alist'. */);
7973 Vfile_coding_system_alist = Qnil;
7974
7975 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7976 doc: /* Alist to decide a coding system to use for a process I/O operation.
7977 The format is ((PATTERN . VAL) ...),
7978 where PATTERN is a regular expression matching a program name,
7979 VAL is a coding system, a cons of coding systems, or a function symbol.
7980 If VAL is a coding system, it is used for both decoding what received
7981 from the program and encoding what sent to the program.
7982 If VAL is a cons of coding systems, the car part is used for decoding,
7983 and the cdr part is used for encoding.
7984 If VAL is a function symbol, the function must return a coding system
7985 or a cons of coding systems which are used as above.
7986
7987 See also the function `find-operation-coding-system'. */);
7988 Vprocess_coding_system_alist = Qnil;
7989
7990 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7991 doc: /* Alist to decide a coding system to use for a network I/O operation.
7992 The format is ((PATTERN . VAL) ...),
7993 where PATTERN is a regular expression matching a network service name
7994 or is a port number to connect to,
7995 VAL is a coding system, a cons of coding systems, or a function symbol.
7996 If VAL is a coding system, it is used for both decoding what received
7997 from the network stream and encoding what sent to the network stream.
7998 If VAL is a cons of coding systems, the car part is used for decoding,
7999 and the cdr part is used for encoding.
8000 If VAL is a function symbol, the function must return a coding system
8001 or a cons of coding systems which are used as above.
8002
8003 See also the function `find-operation-coding-system'. */);
8004 Vnetwork_coding_system_alist = Qnil;
8005
8006 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8007 doc: /* Coding system to use with system messages.
8008 Also used for decoding keyboard input on X Window system. */);
8009 Vlocale_coding_system = Qnil;
8010
8011 /* The eol mnemonics are reset in startup.el system-dependently. */
8012 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8013 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8014 eol_mnemonic_unix = build_string (":");
8015
8016 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8017 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8018 eol_mnemonic_dos = build_string ("\\");
8019
8020 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8021 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8022 eol_mnemonic_mac = build_string ("/");
8023
8024 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8025 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
8026 eol_mnemonic_undecided = build_string (":");
8027
8028 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8029 doc: /* *Non-nil enables character translation while encoding and decoding. */);
8030 Venable_character_translation = Qt;
8031
8032 DEFVAR_LISP ("standard-translation-table-for-decode",
8033 &Vstandard_translation_table_for_decode,
8034 doc: /* Table for translating characters while decoding. */);
8035 Vstandard_translation_table_for_decode = Qnil;
8036
8037 DEFVAR_LISP ("standard-translation-table-for-encode",
8038 &Vstandard_translation_table_for_encode,
8039 doc: /* Table for translating characters while encoding. */);
8040 Vstandard_translation_table_for_encode = Qnil;
8041
8042 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8043 doc: /* Alist of charsets vs revision numbers.
8044 While encoding, if a charset (car part of an element) is found,
8045 designate it with the escape sequence identifying revision (cdr part of the element). */);
8046 Vcharset_revision_alist = Qnil;
8047
8048 DEFVAR_LISP ("default-process-coding-system",
8049 &Vdefault_process_coding_system,
8050 doc: /* Cons of coding systems used for process I/O by default.
8051 The car part is used for decoding a process output,
8052 the cdr part is used for encoding a text to be sent to a process. */);
8053 Vdefault_process_coding_system = Qnil;
8054
8055 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8056 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8057 This is a vector of length 256.
8058 If Nth element is non-nil, the existence of code N in a file
8059 \(or output of subprocess) doesn't prevent it to be detected as
8060 a coding system of ISO 2022 variant which has a flag
8061 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8062 or reading output of a subprocess.
8063 Only 128th through 159th elements has a meaning. */);
8064 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8065
8066 DEFVAR_LISP ("select-safe-coding-system-function",
8067 &Vselect_safe_coding_system_function,
8068 doc: /* Function to call to select safe coding system for encoding a text.
8069
8070 If set, this function is called to force a user to select a proper
8071 coding system which can encode the text in the case that a default
8072 coding system used in each operation can't encode the text.
8073
8074 The default value is `select-safe-coding-system' (which see). */);
8075 Vselect_safe_coding_system_function = Qnil;
8076
8077 DEFVAR_BOOL ("coding-system-require-warning",
8078 &coding_system_require_warning,
8079 doc: /* Internal use only.
8080 If non-nil, on writing a file, `select-safe-coding-system-function' is
8081 called even if `coding-system-for-write' is non-nil. The command
8082 `universal-coding-system-argument' binds this variable to t temporarily. */);
8083 coding_system_require_warning = 0;
8084
8085
8086 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8087 &inhibit_iso_escape_detection,
8088 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8089
8090 By default, on reading a file, Emacs tries to detect how the text is
8091 encoded. This code detection is sensitive to escape sequences. If
8092 the sequence is valid as ISO2022, the code is determined as one of
8093 the ISO2022 encodings, and the file is decoded by the corresponding
8094 coding system (e.g. `iso-2022-7bit').
8095
8096 However, there may be a case that you want to read escape sequences in
8097 a file as is. In such a case, you can set this variable to non-nil.
8098 Then, as the code detection ignores any escape sequences, no file is
8099 detected as encoded in some ISO2022 encoding. The result is that all
8100 escape sequences become visible in a buffer.
8101
8102 The default value is nil, and it is strongly recommended not to change
8103 it. That is because many Emacs Lisp source files that contain
8104 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8105 in Emacs's distribution, and they won't be decoded correctly on
8106 reading if you suppress escape sequence detection.
8107
8108 The other way to read escape sequences in a file without decoding is
8109 to explicitly specify some coding system that doesn't use ISO2022's
8110 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8111 inhibit_iso_escape_detection = 0;
8112
8113 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8114 doc: /* Char table for translating self-inserting characters.
8115 This is applied to the result of input methods, not their input. See also
8116 `keyboard-translate-table'. */);
8117 Vtranslation_table_for_input = Qnil;
8118 }
8119
8120 char *
8121 emacs_strerror (error_number)
8122 int error_number;
8123 {
8124 char *str;
8125
8126 synchronize_system_messages_locale ();
8127 str = strerror (error_number);
8128
8129 if (! NILP (Vlocale_coding_system))
8130 {
8131 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8132 Vlocale_coding_system,
8133 0);
8134 str = (char *) SDATA (dec);
8135 }
8136
8137 return str;
8138 }
8139
8140 #endif /* emacs */
8141
8142 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8143 (do not change this comment) */