]> code.delx.au - gnu-emacs/blob - src/coding.c
(Ffind_operation_coding_system): Call a function by
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7
8 This file is part of GNU Emacs.
9
10 GNU Emacs is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 GNU Emacs is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with GNU Emacs; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 Boston, MA 02110-1301, USA. */
24
25 /*** TABLE OF CONTENTS ***
26
27 0. General comments
28 1. Preamble
29 2. Emacs' internal format (emacs-mule) handlers
30 3. ISO2022 handlers
31 4. Shift-JIS and BIG5 handlers
32 5. CCL handlers
33 6. End-of-line handlers
34 7. C library functions
35 8. Emacs Lisp library functions
36 9. Post-amble
37
38 */
39
40 /*** 0. General comments ***/
41
42
43 /*** GENERAL NOTE on CODING SYSTEMS ***
44
45 A coding system is an encoding mechanism for one or more character
46 sets. Here's a list of coding systems which Emacs can handle. When
47 we say "decode", it means converting some other coding system to
48 Emacs' internal format (emacs-mule), and when we say "encode",
49 it means converting the coding system emacs-mule to some other
50 coding system.
51
52 0. Emacs' internal format (emacs-mule)
53
54 Emacs itself holds a multi-lingual character in buffers and strings
55 in a special format. Details are described in section 2.
56
57 1. ISO2022
58
59 The most famous coding system for multiple character sets. X's
60 Compound Text, various EUCs (Extended Unix Code), and coding
61 systems used in Internet communication such as ISO-2022-JP are
62 all variants of ISO2022. Details are described in section 3.
63
64 2. SJIS (or Shift-JIS or MS-Kanji-Code)
65
66 A coding system to encode character sets: ASCII, JISX0201, and
67 JISX0208. Widely used for PC's in Japan. Details are described in
68 section 4.
69
70 3. BIG5
71
72 A coding system to encode the character sets ASCII and Big5. Widely
73 used for Chinese (mainly in Taiwan and Hong Kong). Details are
74 described in section 4. In this file, when we write "BIG5"
75 (all uppercase), we mean the coding system, and when we write
76 "Big5" (capitalized), we mean the character set.
77
78 4. Raw text
79
80 A coding system for text containing random 8-bit code. Emacs does
81 no code conversion on such text except for end-of-line format.
82
83 5. Other
84
85 If a user wants to read/write text encoded in a coding system not
86 listed above, he can supply a decoder and an encoder for it as CCL
87 (Code Conversion Language) programs. Emacs executes the CCL program
88 while reading/writing.
89
90 Emacs represents a coding system by a Lisp symbol that has a property
91 `coding-system'. But, before actually using the coding system, the
92 information about it is set in a structure of type `struct
93 coding_system' for rapid processing. See section 6 for more details.
94
95 */
96
97 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
98
99 How end-of-line of text is encoded depends on the operating system.
100 For instance, Unix's format is just one byte of `line-feed' code,
101 whereas DOS's format is two-byte sequence of `carriage-return' and
102 `line-feed' codes. MacOS's format is usually one byte of
103 `carriage-return'.
104
105 Since text character encoding and end-of-line encoding are
106 independent, any coding system described above can have any
107 end-of-line format. So Emacs has information about end-of-line
108 format in each coding-system. See section 6 for more details.
109
110 */
111
112 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
113
114 These functions check if a text between SRC and SRC_END is encoded
115 in the coding system category XXX. Each returns an integer value in
116 which appropriate flag bits for the category XXX are set. The flag
117 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
118 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
119 of the range 0x80..0x9F are in multibyte form. */
120 #if 0
121 int
122 detect_coding_emacs_mule (src, src_end, multibytep)
123 unsigned char *src, *src_end;
124 int multibytep;
125 {
126 ...
127 }
128 #endif
129
130 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
131
132 These functions decode SRC_BYTES length of unibyte text at SOURCE
133 encoded in CODING to Emacs' internal format. The resulting
134 multibyte text goes to a place pointed to by DESTINATION, the length
135 of which should not exceed DST_BYTES.
136
137 These functions set the information about original and decoded texts
138 in the members `produced', `produced_char', `consumed', and
139 `consumed_char' of the structure *CODING. They also set the member
140 `result' to one of CODING_FINISH_XXX indicating how the decoding
141 finished.
142
143 DST_BYTES zero means that the source area and destination area are
144 overlapped, which means that we can produce a decoded text until it
145 reaches the head of the not-yet-decoded source text.
146
147 Below is a template for these functions. */
148 #if 0
149 static void
150 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
151 struct coding_system *coding;
152 const unsigned char *source;
153 unsigned char *destination;
154 int src_bytes, dst_bytes;
155 {
156 ...
157 }
158 #endif
159
160 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
161
162 These functions encode SRC_BYTES length text at SOURCE from Emacs'
163 internal multibyte format to CODING. The resulting unibyte text
164 goes to a place pointed to by DESTINATION, the length of which
165 should not exceed DST_BYTES.
166
167 These functions set the information about original and encoded texts
168 in the members `produced', `produced_char', `consumed', and
169 `consumed_char' of the structure *CODING. They also set the member
170 `result' to one of CODING_FINISH_XXX indicating how the encoding
171 finished.
172
173 DST_BYTES zero means that the source area and destination area are
174 overlapped, which means that we can produce encoded text until it
175 reaches the head of the not-yet-encoded source text.
176
177 Below is a template for these functions. */
178 #if 0
179 static void
180 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
181 struct coding_system *coding;
182 unsigned char *source, *destination;
183 int src_bytes, dst_bytes;
184 {
185 ...
186 }
187 #endif
188
189 /*** COMMONLY USED MACROS ***/
190
191 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
192 get one and two bytes from the source text respectively.
193 If there are not enough bytes in the source, they set
194 `coding->result' and jump to `label_end_of_loop'. The caller should
195 set variables `coding', `src' and `src_end' to appropriate pointers in
196 advance. These macros are called from decoding routines
197 `decode_coding_XXX', thus it is assumed that the source text is unibyte. */
198
/* Store the next source byte in C1 and step `src' past it.  If the
   source is exhausted, record CODING_FINISH_INSUFFICIENT_SRC in
   `coding->result' and jump to the caller's `label_end_of_loop'.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
208
/* Store the next two source bytes in C1 and C2 and step `src' past
   them.  If fewer than two bytes remain, consume nothing, record
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to the
   caller's `label_end_of_loop'.

   The remaining length is computed as `src_end - src' instead of
   forming `src + 1', which is not a valid pointer when `src' already
   equals `src_end'.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src_end - src < 2)                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    c2 = *src++;                                                \
  } while (0)
219
220
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero.  In that case a byte equal to
   LEADING_CODE_8_BIT_CONTROL is followed by a second byte, and C1 is
   set to that second byte minus 0x20.

   The second byte is fetched only if it lies before `src_end'.  If
   the source ends right after the leading code, `src' is left just
   past the leading code, `coding->result' is set to
   CODING_FINISH_INSUFFICIENT_SRC and control jumps to
   `label_end_of_loop', exactly as for an empty source.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = *src++;                                                \
    if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
      {                                                         \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            goto label_end_of_loop;                             \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
235
/* Fetch the next character of the source text into C and advance
   `src' past it.  If no source bytes remain, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.

   The caller must have `coding', `src', `src_end' and
   `translation_table' in scope.  This macro is used by the encoding
   routines `encode_coding_XXX', so the source is taken to be in
   multibyte form; when coding->src_multibyte is zero and the bytes at
   `src' do not form a valid multibyte sequence, a single byte is
   taken as the character instead.  If `translation_table' is non-nil
   the character is passed through translate_char before being
   stored.  */

#define ONE_MORE_CHAR(c)                                                \
  do {                                                                  \
    int len = src_end - src;                                            \
    int bytes;                                                          \
    if (len <= 0)                                                       \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    if (! coding->src_multibyte                                         \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))              \
      {                                                                 \
        c = *src;                                                       \
        bytes = 1;                                                      \
      }                                                                 \
    else                                                                \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);                     \
    if (! NILP (translation_table))                                     \
      c = translate_char (translation_table, c, -1, 0, 0);              \
    src += bytes;                                                       \
  } while (0)
264
265
/* Write character C in multibyte form at `dst', advancing `dst' and
   counting it in coding->produced_char.  If the bytes do not fit
   before the output limit (`dst_end', or `src' when `dst_bytes' is
   zero), set `coding->result' to CODING_FINISH_INSUFFICIENT_DST and
   jump to `label_end_of_loop'.

   While a composition is in progress the behavior depends on its
   method: for COMPOSITION_RELATIVE the character is only written to
   `dst'; for COMPOSITION_WITH_RULE it is written and also recorded
   as a composition component; for the remaining methods it is only
   recorded as a component (in coding->cmp_data->data) and not
   written.  After recording, coding->composition_rule_follows is set
   according to whether the method is still something other than
   COMPOSITION_WITH_ALTCHARS.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! (COMPOSING_P (coding)                                         \
           && coding->composing != COMPOSITION_RELATIVE                 \
           && coding->composing != COMPOSITION_WITH_RULE))              \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst + bytes) <= (dst_bytes ? dst_end : src))               \
          {                                                             \
            dst += CHAR_STRING (c, dst);                                \
            coding->produced_char++;                                    \
          }                                                             \
        else                                                            \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
      }                                                                 \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
      }                                                                 \
  } while (0)
300
301
/* Append the single byte C at `dst'.  The output limit is `dst_end',
   or `src' when `dst_bytes' is zero.  If `dst' has reached the
   limit, set `coding->result' to CODING_FINISH_INSUFFICIENT_DST and
   jump to `label_end_of_loop' without writing.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
311
/* Append the two bytes C1 and C2 at `dst'.  The output limit is
   `dst_end', or `src' when `dst_bytes' is zero.  If fewer than two
   bytes of room remain, write nothing, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.

   The room is measured as (limit - dst) so that no pointer beyond
   the output limit is ever formed, and the arguments are
   parenthesized so that compound expressions expand safely.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if ((dst_bytes ? dst_end : src) - dst < 2)                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = (c1);                                              \
    *dst++ = (c2);                                              \
  } while (0)
321
/* Copy the bytes in the range [FROM, TO) to `dst', advancing both
   FROM and `dst'.  FROM must be a modifiable pointer lvalue; it
   equals TO afterwards.  TO is evaluated more than once.  The output
   limit is `dst_end', or `src' when `dst_bytes' is zero.  If the
   whole range does not fit, copy nothing, set `coding->result' to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.

   The arguments are parenthesized so that expressions such as a
   conditional TO expand correctly, and the room is measured as
   (limit - dst) so that no pointer beyond the limit is formed.  */
#define EMIT_BYTES(from, to)                                            \
  do {                                                                  \
    if ((to) - (from) > (dst_bytes ? dst_end : src) - dst)              \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    while ((from) < (to))                                               \
      *dst++ = *(from)++;                                               \
  } while (0)
332
333 \f
334 /*** 1. Preamble ***/
335
336 #ifdef emacs
337 #include <config.h>
338 #endif
339
340 #include <stdio.h>
341
342 #ifdef emacs
343
344 #include "lisp.h"
345 #include "buffer.h"
346 #include "charset.h"
347 #include "composite.h"
348 #include "ccl.h"
349 #include "coding.h"
350 #include "window.h"
351 #include "intervals.h"
352
353 #else /* not emacs */
354
355 #include "mulelib.h"
356
357 #endif /* not emacs */
358
/* Lisp symbols used by the coding-system machinery.
   Qcoding_system is the property under which coding_safe_chars
   (below) fetches a coding system's spec vector, and Qsafe_chars is
   the key it looks up in that spec's property list.
   NOTE(review): the other symbols are presumably interned and
   staticpro'd in this file's initialization code -- confirm there. */
359 Lisp_Object Qcoding_system, Qeol_type;
360 Lisp_Object Qbuffer_file_coding_system;
361 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
362 Lisp_Object Qno_conversion, Qundecided;
363 Lisp_Object Qcoding_system_history;
364 Lisp_Object Qsafe_chars;
365 Lisp_Object Qvalid_codes;
366 Lisp_Object Qascii_incompatible;
367
/* Qinsert_file_contents and Qwrite_region are only declared here
   (extern); the remaining symbols in this group are defined here. */
368 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
369 Lisp_Object Qcall_process, Qcall_process_region;
370 Lisp_Object Qstart_process, Qopen_network_stream;
371 Lisp_Object Qtarget_idx;
372
373 /* If a symbol has this property, evaluate the value to define the
374 symbol as a coding system. */
375 Lisp_Object Qcoding_system_define_form;
376
377 Lisp_Object Vselect_safe_coding_system_function;
378
379 int coding_system_require_warning;
380
381 /* Mnemonic string for each format of end-of-line. */
382 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
383 /* Mnemonic string to indicate format of end-of-line is not yet
384 decided. */
385 Lisp_Object eol_mnemonic_undecided;
386
387 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
388 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
389 This has an effect only for external encoding (i.e. for output to
390 file and process), not for in-buffer or Lisp string encoding. */
391 int system_eol_type;
392
393 #ifdef emacs
394
395 /* Information about which coding system is safe for which chars.
396 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
397
398 GENERIC-LIST is a list of generic coding systems which can encode
399 any characters.
400
401 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
402 corresponding char table that contains safe chars. */
403 Lisp_Object Vcoding_system_safe_chars;
404
405 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
406
407 Lisp_Object Qcoding_system_p, Qcoding_system_error;
408
409 /* Coding system emacs-mule and raw-text are for converting only
410 end-of-line format. */
411 Lisp_Object Qemacs_mule, Qraw_text;
412
413 Lisp_Object Qutf_8;
414
415 /* Coding-systems are handed between Emacs Lisp programs and C internal
416 routines by the following three variables. */
417 /* Coding-system for reading files and receiving data from process. */
418 Lisp_Object Vcoding_system_for_read;
419 /* Coding-system for writing files and sending data to process. */
420 Lisp_Object Vcoding_system_for_write;
421 /* Coding-system actually used in the latest I/O. */
422 Lisp_Object Vlast_coding_system_used;
423
424 /* A vector of length 256 which contains information about special
425 Latin codes (especially for dealing with Microsoft codes). */
426 Lisp_Object Vlatin_extra_code_table;
427
428 /* Flag to inhibit code conversion of end-of-line format. */
429 int inhibit_eol_conversion;
430
431 /* Flag to inhibit ISO2022 escape sequence detection. */
432 int inhibit_iso_escape_detection;
433
434 /* Flag to make buffer-file-coding-system inherit from process-coding. */
435 int inherit_process_coding_system;
436
437 /* Coding system to be used to encode text for terminal display. */
438 struct coding_system terminal_coding;
439
440 /* Coding system to be used to encode text for terminal display when
441 terminal coding system is nil. */
442 struct coding_system safe_terminal_coding;
443
444 /* Coding system of what is sent from terminal keyboard. */
445 struct coding_system keyboard_coding;
446
447 /* Default coding system to be used to write a file. */
448 struct coding_system default_buffer_file_coding;
449
450 Lisp_Object Vfile_coding_system_alist;
451 Lisp_Object Vprocess_coding_system_alist;
452 Lisp_Object Vnetwork_coding_system_alist;
453
454 Lisp_Object Vlocale_coding_system;
455
456 #endif /* emacs */
457
458 Lisp_Object Qcoding_category, Qcoding_category_index;
459
460 /* List of symbols `coding-category-xxx' ordered by priority. */
461 Lisp_Object Vcoding_category_list;
462
463 /* Table of coding categories (Lisp symbols). */
464 Lisp_Object Vcoding_category_table;
465
466 /* Table of the names of the `coding-category-XXX' symbols, one string
   per coding category. The array is dimensioned by
   CODING_CATEGORY_IDX_MAX.
   NOTE(review): presumably indexed by the CODING_CATEGORY_IDX_XXX
   values, so the order of the strings must match that enumeration --
   confirm against coding.h. */
467 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
468 "coding-category-emacs-mule",
469 "coding-category-sjis",
470 "coding-category-iso-7",
471 "coding-category-iso-7-tight",
472 "coding-category-iso-8-1",
473 "coding-category-iso-8-2",
474 "coding-category-iso-7-else",
475 "coding-category-iso-8-else",
476 "coding-category-ccl",
477 "coding-category-big5",
478 "coding-category-utf-8",
479 "coding-category-utf-16-be",
480 "coding-category-utf-16-le",
481 "coding-category-raw-text",
482 "coding-category-binary"
483 };
484
485 /* Table of pointers to coding systems corresponding to each coding
486 categories. */
487 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
488
489 /* Table of coding category masks. Nth element is a mask for a coding
490 category of which priority is Nth. */
491 static
492 int coding_priorities[CODING_CATEGORY_IDX_MAX];
493
494 /* Flag to tell if we look up translation table on character code
495 conversion. */
496 Lisp_Object Venable_character_translation;
497 /* Standard translation table to look up on decoding (reading). */
498 Lisp_Object Vstandard_translation_table_for_decode;
499 /* Standard translation table to look up on encoding (writing). */
500 Lisp_Object Vstandard_translation_table_for_encode;
501
502 Lisp_Object Qtranslation_table;
503 Lisp_Object Qtranslation_table_id;
504 Lisp_Object Qtranslation_table_for_decode;
505 Lisp_Object Qtranslation_table_for_encode;
506
507 /* Alist of charsets vs revision number. */
508 Lisp_Object Vcharset_revision_alist;
509
510 /* Default coding systems used for process I/O. */
511 Lisp_Object Vdefault_process_coding_system;
512
513 /* Char table for translating Quail and self-inserting input. */
514 Lisp_Object Vtranslation_table_for_input;
515
516 /* Global flag to tell that we can't call post-read-conversion and
517 pre-write-conversion functions. Usually the value is zero, but it
518 is set to 1 temporarily while such functions are running. This is
519 to avoid infinite recursive call. */
520 static int inhibit_pre_post_conversion;
521
522 Lisp_Object Qchar_coding_system;
523
524 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
525 its validity. */
526
527 Lisp_Object
528 coding_safe_chars (coding_system)
529 Lisp_Object coding_system;
530 {
531 Lisp_Object coding_spec, plist, safe_chars;
532
533 coding_spec = Fget (coding_system, Qcoding_system);
534 plist = XVECTOR (coding_spec)->contents[3];
535 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
536 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
537 }
538
/* Nonzero if character C is safe with respect to SAFE_CHARS: either
   SAFE_CHARS is Qt, or the entry for C in the char table SAFE_CHARS
   is non-nil.  */
#define CODING_SAFE_CHAR_P(safe_chars, c) \
  (EQ (safe_chars, Qt) ? 1 : ! NILP (CHAR_TABLE_REF (safe_chars, c)))
541
542 \f
543 /*** 2. Emacs internal format (emacs-mule) handlers ***/
544
545 /* Emacs' internal format for representation of multiple character
546 sets is a kind of multi-byte encoding, i.e. characters are
547 represented by variable-length sequences of one-byte codes.
548
549 ASCII characters and control characters (e.g. `tab', `newline') are
550 represented by one-byte sequences which are their ASCII codes, in
551 the range 0x00 through 0x7F.
552
553 8-bit characters of the range 0x80..0x9F are represented by
554 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
555 code + 0x20).
556
557 8-bit characters of the range 0xA0..0xFF are represented by
558 one-byte sequences which are their 8-bit code.
559
560 The other characters are represented by a sequence of `base
561 leading-code', optional `extended leading-code', and one or two
562 `position-code's. The length of the sequence is determined by the
563 base leading-code. Leading-code takes the range 0x81 through 0x9D,
564 whereas extended leading-code and position-code take the range 0xA0
565 through 0xFF. See `charset.h' for more details about leading-code
566 and position-code.
567
568 --- CODE RANGE of Emacs' internal format ---
569 character set range
570 ------------- -----
571 ascii 0x00..0x7F
572 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
573 eight-bit-graphic 0xA0..0xBF
574 ELSE 0x81..0x9D + [0xA0..0xFF]+
575 ---------------------------------------------
576
577 As this is the internal character representation, the format is
578 usually not used externally (i.e. in a file or in a data sent to a
579 process). But, it is possible to have a text externally in this
580 format (i.e. by encoding by the coding system `emacs-mule').
581
582 In that case, a sequence of one-byte codes has a slightly different
583 form.
584
585 Firstly, all characters in eight-bit-control are represented by
586 one-byte sequences which are their 8-bit code.
587
588 Next, character composition data are represented by the byte
589 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
590 where,
591 METHOD is 0xF0 plus one of composition method (enum
592 composition_method),
593
594 BYTES is 0xA0 plus the byte length of these composition data,
595
596 CHARS is 0xA0 plus the number of characters composed by these
597 data,
598
599 COMPONENTs are characters of multibyte form or composition
600 rules encoded by two-byte of ASCII codes.
601
602 In addition, for backward compatibility, the following formats are
603 also recognized as composition data on decoding.
604
605 0x80 MSEQ ...
606 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
607
608 Here,
609 MSEQ is a multibyte form but in these special format:
610 ASCII: 0xA0 ASCII_CODE+0x80,
611 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
612 RULE is a one byte code of the range 0xA0..0xF0 that
613 represents a composition rule.
614 */
615
/* Table with one `enum emacs_code_class_type' entry for each of the
   256 possible byte values.
   NOTE(review): presumably filled in by this file's initialization
   code and consulted when classifying bytes of emacs-mule text --
   confirm against the initializer and charset.h/coding.h. */
616 enum emacs_code_class_type emacs_code_class[256];
617
618 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
619 Check if a text is encoded in Emacs' internal format. If it is,
620 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
621
622 static int
623 detect_coding_emacs_mule (src, src_end, multibytep)
624 unsigned char *src, *src_end;
625 int multibytep;
626 {
627 unsigned char c;
628 int composing = 0;
629 /* Dummy for ONE_MORE_BYTE. */
630 struct coding_system dummy_coding;
631 struct coding_system *coding = &dummy_coding;
632
633 while (1)
634 {
635 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
636
637 if (composing)
638 {
639 if (c < 0xA0)
640 composing = 0;
641 else if (c == 0xA0)
642 {
643 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
644 c &= 0x7F;
645 }
646 else
647 c -= 0x20;
648 }
649
650 if (c < 0x20)
651 {
652 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
653 return 0;
654 }
655 else if (c >= 0x80 && c < 0xA0)
656 {
657 if (c == 0x80)
658 /* Old leading code for a composite character. */
659 composing = 1;
660 else
661 {
662 unsigned char *src_base = src - 1;
663 int bytes;
664
665 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
666 bytes))
667 return 0;
668 src = src_base + bytes;
669 }
670 }
671 }
672 label_end_of_loop:
673 return CODING_CATEGORY_MASK_EMACS_MULE;
674 }
675
676
/* Open a new composition record in CODING->cmp_data.  The record
   begins at the current `used' index, which is remembered in
   CODING->cmp_data_start.  Slot 0 is set to -1 (the length is filled
   in later by CODING_ADD_COMPOSITION_END), slot 1 to the starting
   position START offset by `char_offset', and slot 3 to METHOD.
   Slot 2 is left for the ending position.  `used' grows by 4.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *cmp_data = coding->cmp_data;       \
    int *data = &cmp_data->data[cmp_data->used];                \
    coding->cmp_data_start = cmp_data->used;                    \
    *data = -1;                                                 \
    data[1] = cmp_data->char_offset + start;                    \
    data[3] = (int) method;                                     \
    cmp_data->used += 4;                                        \
  } while (0)
689
/* Close the composition record that starts at CODING->cmp_data_start:
   store the record's length (number of slots used so far) in slot 0
   and the ending position END, offset by `char_offset', in slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                 \
  do {                                                          \
    struct composition_data *cmp_data = coding->cmp_data;       \
    int *data = &cmp_data->data[coding->cmp_data_start];        \
    *data = cmp_data->used - coding->cmp_data_start;            \
    data[2] = cmp_data->char_offset + end;                      \
  } while (0)
699
/* Append one COMPONENT (alternate character or composition rule) to
   the composition record being built in CODING->cmp_data.  When the
   record thereby reaches COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, it
   is closed at CODING->produced_char and CODING->composing is reset
   to COMPOSITION_NO.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    coding->cmp_data->data[coding->cmp_data->used] = component;         \
    coding->cmp_data->used++;                                           \
    if (COMPOSITION_DATA_MAX_BUNCH_LENGTH                               \
        == coding->cmp_data->used - coding->cmp_data_start)             \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
712
713
/* Yield the byte pointed to by SRC and advance SRC past it.  If SRC
   is not less than SRC_END, yield -1 and leave SRC unchanged.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
718
719
720 /* Decode a character represented as a component of composition
721 sequence of Emacs 20 style at SRC. Set C to that character, store
722 its multibyte form sequence at P, and set P to the end of that
723 sequence. If no valid character is found, set C to -1.

   Bytes are fetched with SAFE_ONE_MORE_BYTE, so the caller must have
   `src', `src_end' and `coding' in scope. If the source is already
   exhausted, C is left negative (the `break' leaves the enclosing
   do/while). Otherwise the first byte selects one of these cases:
   - a byte for which CHAR_HEAD_P holds is rejected (C becomes -1);
   - 0xA0 is followed by one byte that must be >= 0xA0; that byte
     minus 0x80 is the character and is stored at P;
   - a byte that is a base leading code once 0x20 is subtracted starts
     a multibyte sequence: the restored leading code and the following
     bytes are copied to P and the result is accepted only if it is a
     valid multibyte form (or, when coding->flags is nonzero, an
     eight-bit-control sequence whose second byte is not a char head);
   - any other byte is rejected (C becomes -1).
   Bytes already copied to P are not taken back when the sequence
   turns out to be invalid. */
724
725 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p) \
726 do { \
727 int bytes; \
728 \
729 c = SAFE_ONE_MORE_BYTE (); \
730 if (c < 0) \
731 break; \
732 if (CHAR_HEAD_P (c)) \
733 c = -1; \
734 else if (c == 0xA0) \
735 { \
736 c = SAFE_ONE_MORE_BYTE (); \
737 if (c < 0xA0) \
738 c = -1; \
739 else \
740 { \
741 c -= 0x80; \
742 *p++ = c; \
743 } \
744 } \
745 else if (BASE_LEADING_CODE_P (c - 0x20)) \
746 { \
747 unsigned char *p0 = p; \
748 \
749 c -= 0x20; \
750 *p++ = c; \
751 bytes = BYTES_BY_CHAR_HEAD (c); \
752 while (--bytes) \
753 { \
754 c = SAFE_ONE_MORE_BYTE (); \
755 if (c < 0) \
756 break; \
757 *p++ = c; \
758 } \
759 if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes) \
760 || (coding->flags /* We are recovering a file. */ \
761 && p0[0] == LEADING_CODE_8_BIT_CONTROL \
762 && ! CHAR_HEAD_P (p0[1]))) \
763 c = STRING_CHAR (p0, bytes); \
764 else \
765 c = -1; \
766 } \
767 else \
768 c = -1; \
769 } while (0)
770
771
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 20 style at SRC.  The rule byte is 0xA0 plus a
   value below 81; that value is split into GREF (quotient by 9) and
   NREF (remainder by 9), both stored in the caller's variables `gref'
   and `nref', and C is set to COMPOSITION_ENCODE_RULE (gref, nref).
   If no valid rule is found (including at the end of the source), set
   C to -1.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE ();                          \
    c -= 0xA0;                                          \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
788
789
790 /* Decode composition sequence encoded by `emacs-mule' at the source
791 pointed by SRC. SRC_END is the end of source. Store information
792 of the composition in CODING->cmp_data.
793
794 For backward compatibility, decode also a composition sequence of
795 Emacs 20 style. In that case, the composition sequence contains
796 characters that should be extracted into a buffer or string. Store
797 those characters at *DESTINATION in multibyte form.
798
799 If we encounter an invalid byte sequence, return 0.
800 If we encounter an insufficient source or destination, or
801 insufficient space in CODING->cmp_data, return 1.
802 Otherwise, return consumed bytes in the source.
803
804 */
805 static INLINE int
806 decode_composition_emacs_mule (coding, src, src_end,
807 destination, dst_end, dst_bytes)
808 struct coding_system *coding;
809 const unsigned char *src, *src_end;
810 unsigned char **destination, *dst_end;
811 int dst_bytes;
812 {
813 unsigned char *dst = *destination;
814 int method, data_len, nchars;
815 const unsigned char *src_base = src++;
816 /* Store components of composition. */
817 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
818 int ncomponent;
819 /* Store multibyte form of characters to be composed. This is for
820 Emacs 20 style composition sequence. */
821 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
822 unsigned char *bufp = buf;
823 int c, i, gref, nref;
824
825 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
826 >= COMPOSITION_DATA_SIZE)
827 {
828 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
829 return -1;
830 }
831
832 ONE_MORE_BYTE (c);
833 if (c - 0xF0 >= COMPOSITION_RELATIVE
834 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
835 {
836 int with_rule;
837
838 method = c - 0xF0;
839 with_rule = (method == COMPOSITION_WITH_RULE
840 || method == COMPOSITION_WITH_RULE_ALTCHARS);
841 ONE_MORE_BYTE (c);
842 data_len = c - 0xA0;
843 if (data_len < 4
844 || src_base + data_len > src_end)
845 return 0;
846 ONE_MORE_BYTE (c);
847 nchars = c - 0xA0;
848 if (c < 1)
849 return 0;
850 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
851 {
852 /* If it is longer than this, it can't be valid. */
853 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
854 return 0;
855
856 if (ncomponent % 2 && with_rule)
857 {
858 ONE_MORE_BYTE (gref);
859 gref -= 32;
860 ONE_MORE_BYTE (nref);
861 nref -= 32;
862 c = COMPOSITION_ENCODE_RULE (gref, nref);
863 }
864 else
865 {
866 int bytes;
867 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
868 || (coding->flags /* We are recovering a file. */
869 && src[0] == LEADING_CODE_8_BIT_CONTROL
870 && ! CHAR_HEAD_P (src[1])))
871 c = STRING_CHAR (src, bytes);
872 else
873 c = *src, bytes = 1;
874 src += bytes;
875 }
876 component[ncomponent] = c;
877 }
878 }
879 else if (c >= 0x80)
880 {
881 /* This may be an old Emacs 20 style format. See the comment at
882 the section 2 of this file. */
883 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
884 if (src == src_end
885 && !(coding->mode & CODING_MODE_LAST_BLOCK))
886 goto label_end_of_loop;
887
888 src_end = src;
889 src = src_base + 1;
890 if (c < 0xC0)
891 {
892 method = COMPOSITION_RELATIVE;
893 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
894 {
895 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
896 if (c < 0)
897 break;
898 component[ncomponent++] = c;
899 }
900 if (ncomponent < 2)
901 return 0;
902 nchars = ncomponent;
903 }
904 else if (c == 0xFF)
905 {
906 method = COMPOSITION_WITH_RULE;
907 src++;
908 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
909 if (c < 0)
910 return 0;
911 component[0] = c;
912 for (ncomponent = 1;
913 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
914 {
915 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
916 if (c < 0)
917 break;
918 component[ncomponent++] = c;
919 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
920 if (c < 0)
921 break;
922 component[ncomponent++] = c;
923 }
924 if (ncomponent < 3)
925 return 0;
926 nchars = (ncomponent + 1) / 2;
927 }
928 else
929 return 0;
930 }
931 else
932 return 0;
933
934 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
935 {
936 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
937 for (i = 0; i < ncomponent; i++)
938 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
939 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
940 if (buf < bufp)
941 {
942 unsigned char *p = buf;
943 EMIT_BYTES (p, bufp);
944 *destination += bufp - buf;
945 coding->produced_char += nchars;
946 }
947 return (src - src_base);
948 }
949 label_end_of_loop:
950 return -1;
951 }
952
953 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode SRC_BYTES bytes at SOURCE, which are in the external
   emacs-mule form, into DESTINATION. Each iteration of the main loop
   handles one unit of the source: an end-of-line byte, a composition
   sequence starting with 0x80, a valid multibyte sequence, or an
   invalid byte sequence. On exit CODING->consumed and
   CODING->consumed_char are both set to the number of source bytes
   before the unit at which the loop stopped, and CODING->produced to
   the number of bytes written at DESTINATION. */
954
955 static void
956 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
957 struct coding_system *coding;
958 const unsigned char *source;
959 unsigned char *destination;
960 int src_bytes, dst_bytes;
961 {
962 const unsigned char *src = source;
963 const unsigned char *src_end = source + src_bytes;
964 unsigned char *dst = destination;
965 unsigned char *dst_end = destination + dst_bytes;
966 /* SRC_BASE remembers the start position in source in each loop.
967 The loop will be exited when there's not enough source code, or
968 when there's not enough destination area to produce a
969 character. */
970 const unsigned char *src_base;
971
972 coding->produced_char = 0;
973 while ((src_base = src) < src_end)
974 {
/* TMP receives the multibyte form of a lone byte; P points at the
   BYTES bytes to copy to DST at the bottom of the loop. */
975 unsigned char tmp[MAX_MULTIBYTE_LENGTH];
976 const unsigned char *p;
977 int bytes;
978
979 if (*src == '\r')
980 {
981 int c = *src++;
982
/* With CR end-of-line a carriage return becomes a newline. With CRLF
   the next byte is examined: "\r\n" becomes a newline, otherwise the
   extra byte is pushed back and the carriage return is kept. */
983 if (coding->eol_type == CODING_EOL_CR)
984 c = '\n';
985 else if (coding->eol_type == CODING_EOL_CRLF)
986 {
987 ONE_MORE_BYTE (c);
988 if (c != '\n')
989 {
990 src--;
991 c = '\r';
992 }
993 }
/* NOTE(review): this store is not checked against the output limit,
   unlike the general path at the bottom of the loop -- confirm that
   callers always leave room for it. */
994 *dst++ = c;
995 coding->produced_char++;
996 continue;
997 }
998 else if (*src == '\n')
999 {
/* A bare newline contradicts a CR or CRLF end-of-line type; stop with
   CODING_FINISH_INCONSISTENT_EOL if the mode asks for that. */
1000 if ((coding->eol_type == CODING_EOL_CR
1001 || coding->eol_type == CODING_EOL_CRLF)
1002 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1003 {
1004 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1005 goto label_end_of_loop;
1006 }
/* NOTE(review): unchecked store, as for the carriage-return case. */
1007 *dst++ = *src++;
1008 coding->produced_char++;
1009 continue;
1010 }
1011 else if (*src == 0x80 && coding->cmp_data)
1012 {
1013 /* Start of composition data. */
1014 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1015 &dst, dst_end,
1016 dst_bytes);
/* Negative: source, destination or composition space ran out, so
   stop. Positive: that many source bytes were handled. Zero: not a
   valid composition, so the 0x80 byte is emitted below as a character
   by itself. */
1017 if (consumed < 0)
1018 goto label_end_of_loop;
1019 else if (consumed > 0)
1020 {
1021 src += consumed;
1022 continue;
1023 }
1024 bytes = CHAR_STRING (*src, tmp);
1025 p = tmp;
1026 src++;
1027 }
1028 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1029 || (coding->flags /* We are recovering a file. */
1030 && src[0] == LEADING_CODE_8_BIT_CONTROL
1031 && ! CHAR_HEAD_P (src[1])))
1032 {
/* A valid multibyte sequence is copied as it is. */
1033 p = src;
1034 src += bytes;
1035 }
1036 else
1037 {
1038 int i, c;
1039
/* Not a valid multibyte form. Read as many following bytes as the
   leading byte calls for, stopping early at a char head. */
1040 bytes = BYTES_BY_CHAR_HEAD (*src);
1041 src++;
1042 for (i = 1; i < bytes; i++)
1043 {
1044 ONE_MORE_BYTE (c);
1045 if (CHAR_HEAD_P (c))
1046 break;
1047 }
1048 if (i < bytes)
1049 {
/* The sequence was cut short by a char head: emit only the first byte,
   converted by CHAR_STRING, and rescan from the byte after it. */
1050 bytes = CHAR_STRING (*src_base, tmp);
1051 p = tmp;
1052 src = src_base + 1;
1053 }
1054 else
1055 {
/* The sequence has the expected length; copy its bytes unchanged. */
1056 p = src_base;
1057 }
1058 }
/* The output limit is DST_END, or the current source position when
   DST_BYTES is zero. Note the `>=': the copy is made only if it ends
   strictly before the limit. */
1059 if (dst + bytes >= (dst_bytes ? dst_end : src))
1060 {
1061 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1062 break;
1063 }
1064 while (bytes--) *dst++ = *p++;
1065 coding->produced_char++;
1066 }
1067 label_end_of_loop:
/* SRC_BASE, not SRC, is used so that a unit that was only partly read
   is not counted as consumed. */
1068 coding->consumed = coding->consumed_char = src_base - source;
1069 coding->produced = dst - destination;
1070 }
1071
1072
1073 /* Encode composition data stored at DATA into a special byte sequence
1074 starting by 0x80. Update CODING->cmp_data_start and maybe
1075 CODING->cmp_data for the next call.

   Layout of DATA as read here: DATA[0] is the length of this entry
   (it is also the amount added to CODING->cmp_data_start),
   DATA[2] - DATA[1] is emitted as the composed-character count,
   DATA[3] is the composition method, and DATA[4] onward are the
   components.  For the two WITH_RULE methods the components after
   DATA[4] alternate rule, character.

   The emitted sequence is a 4-byte header (0x80, 0xF0 + method,
   0xA0 + total bytes, 0xA0 + composed chars) followed by the
   components.

   The macro uses `dst', `dst_end', `dst_bytes' and `src' from the
   expanding function and jumps to that function's
   `label_end_of_loop' after setting CODING->result to
   CODING_FINISH_INSUFFICIENT_DST when the output does not fit.

   NOTE(review): P - BUF already includes the 4 header bytes (P
   starts at BUF + 4), so the `+ 4' in the space check makes it
   conservative by 4 bytes.  Nothing in the macro checks P against
   the 1024-byte BUF; presumably the size of one entry is bounded
   elsewhere -- confirm.  */
1076
1077 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1078 do { \
1079 unsigned char buf[1024], *p0 = buf, *p; \
1080 int len = data[0]; \
1081 int i; \
1082 \
1083 buf[0] = 0x80; \
1084 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1085 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1086 p = buf + 4; \
1087 if (data[3] == COMPOSITION_WITH_RULE \
1088 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1089 { \
1090 p += CHAR_STRING (data[4], p); \
1091 for (i = 5; i < len; i += 2) \
1092 { \
1093 int gref, nref; \
1094 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1095 *p++ = 0x20 + gref; \
1096 *p++ = 0x20 + nref; \
1097 p += CHAR_STRING (data[i + 1], p); \
1098 } \
1099 } \
1100 else \
1101 { \
1102 for (i = 4; i < len; i++) \
1103 p += CHAR_STRING (data[i], p); \
1104 } \
1105 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
1106 \
1107 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1108 { \
1109 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1110 goto label_end_of_loop; \
1111 } \
1112 while (p0 < p) \
1113 *dst++ = *p0++; \
1114 coding->cmp_data_start += data[0]; \
1115 if (coding->cmp_data_start == coding->cmp_data->used \
1116 && coding->cmp_data->next) \
1117 { \
1118 coding->cmp_data = coding->cmp_data->next; \
1119 coding->cmp_data_start = 0; \
1120 } \
1121 } while (0)
1122
1123
/* Forward declaration: encode_coding_emacs_mule below calls
   encode_eol when there is no composition data to encode.  The
   definition is not in this part of the file.  */
1124 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1125 unsigned char *, int, int));
1126
/* Encode SRC_BYTES bytes at SOURCE into DESTINATION in emacs-mule
   format, emitting composition information recorded in
   CODING->cmp_data ahead of each composed run and converting LF
   according to CODING->eol_type.

   When CODING->cmp_data is null or empty, the whole job is
   delegated to encode_eol.

   On exit CODING->consumed is SRC_BASE - SOURCE, and CODING->produced
   and CODING->produced_char are both the number of destination bytes
   written.  CODING->consumed_char is incremented once per character
   read.

   The `while (1)' loop has no visible exit; it is left through
   `label_end_of_loop', reached from ENCODE_COMPOSITION_EMACS_MULE
   and presumably from ONE_MORE_CHAR and the EMIT_* macros (defined
   outside this chunk -- confirm).  */
1127 static void
1128 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1129 struct coding_system *coding;
1130 const unsigned char *source;
1131 unsigned char *destination;
1132 int src_bytes, dst_bytes;
1133 {
1134 const unsigned char *src = source;
1135 const unsigned char *src_end = source + src_bytes;
1136 unsigned char *dst = destination;
1137 unsigned char *dst_end = destination + dst_bytes;
1138 const unsigned char *src_base;
1139 int c;
1140 int char_offset;
1141 int *data;
1142
/* TRANSLATION_TABLE is only assigned in the visible code; it is
   presumably read by ONE_MORE_CHAR -- confirm before removing.  */
1143 Lisp_Object translation_table;
1144
1145 translation_table = Qnil;
1146
1147 /* Optimization for the case that there's no composition. */
1148 if (!coding->cmp_data || coding->cmp_data->used == 0)
1149 {
1150 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1151 return;
1152 }
1153
1154 char_offset = coding->cmp_data->char_offset;
1155 data = coding->cmp_data->data + coding->cmp_data_start;
1156 while (1)
1157 {
1158 src_base = src;
1159
1160 /* If SRC starts a composition, encode the information about the
1161 composition in advance. */
1162 if (coding->cmp_data_start < coding->cmp_data->used
1163 && char_offset + coding->consumed_char == data[1])
1164 {
1165 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
/* The macro may have advanced CODING->cmp_data and
   CODING->cmp_data_start, so reload the cached values.  */
1166 char_offset = coding->cmp_data->char_offset;
1167 data = coding->cmp_data->data + coding->cmp_data_start;
1168 }
1169
1170 ONE_MORE_CHAR (c);
1171 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1172 || coding->eol_type == CODING_EOL_CR))
1173 {
1174 if (coding->eol_type == CODING_EOL_CRLF)
1175 EMIT_TWO_BYTES ('\r', c);
1176 else
1177 EMIT_ONE_BYTE ('\r');
1178 }
1179 else if (SINGLE_BYTE_CHAR_P (c))
1180 {
1181 if (coding->flags && ! ASCII_BYTE_P (c))
1182 {
1183 /* As we are auto saving, retain the multibyte form for
1184 8-bit chars. */
1185 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1186 int bytes = CHAR_STRING (c, buf);
1187
1188 if (bytes == 1)
1189 EMIT_ONE_BYTE (buf[0]);
1190 else
1191 EMIT_TWO_BYTES (buf[0], buf[1]);
1192 }
1193 else
1194 EMIT_ONE_BYTE (c);
1195 }
1196 else
/* Multibyte character: copy its source bytes unchanged.  */
1197 EMIT_BYTES (src_base, src);
1198 coding->consumed_char++;
1199 }
1200 label_end_of_loop:
1201 coding->consumed = src_base - source;
1202 coding->produced = coding->produced_char = dst - destination;
1203 return;
1204 }
1205
1206 \f
1207 /*** 3. ISO2022 handlers ***/
1208
1209 /* The following note describes the coding system ISO2022 briefly.
1210 Since the intention of this note is to help understand the
1211 functions in this file, some parts are NOT ACCURATE or are OVERLY
1212 SIMPLIFIED. For thorough understanding, please refer to the
1213 original document of ISO2022. This is equivalent to the standard
1214 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1215
1216 ISO2022 provides many mechanisms to encode several character sets
1217 in 7-bit and 8-bit environments. For 7-bit environments, all text
1218 is encoded using bytes less than 128. This may make the encoded
1219 text a little bit longer, but the text passes more easily through
1220 several types of gateway, some of which strip off the MSB (Most
1221 Significant Bit).
1222
1223 There are two kinds of character sets: control character sets and
1224 graphic character sets. The former contain control characters such
1225 as `newline' and `escape' to provide control functions (control
1226 functions are also provided by escape sequences). The latter
1227 contain graphic characters such as 'A' and '-'. Emacs recognizes
1228 two control character sets and many graphic character sets.
1229
1230 Graphic character sets are classified into one of the following
1231 four classes, according to the number of bytes (DIMENSION) and
1232 number of characters in one dimension (CHARS) of the set:
1233 - DIMENSION1_CHARS94
1234 - DIMENSION1_CHARS96
1235 - DIMENSION2_CHARS94
1236 - DIMENSION2_CHARS96
1237
1238 In addition, each character set is assigned an identification tag,
1239 unique for each set, called the "final character" (denoted as <F>
1240 hereafter). The <F> of each character set is decided by ECMA(*)
1241 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1242 (0x30..0x3F are for private use only).
1243
1244 Note (*): ECMA = European Computer Manufacturers Association
1245
1246 Here are examples of graphic character sets [NAME(<F>)]:
1247 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1248 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1249 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1250 o DIMENSION2_CHARS96 -- none for the moment
1251
1252 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1253 C0 [0x00..0x1F] -- control character plane 0
1254 GL [0x20..0x7F] -- graphic character plane 0
1255 C1 [0x80..0x9F] -- control character plane 1
1256 GR [0xA0..0xFF] -- graphic character plane 1
1257
1258 A control character set is directly designated and invoked to C0 or
1259 C1 by an escape sequence. The most common case is that:
1260 - ISO646's control character set is designated/invoked to C0, and
1261 - ISO6429's control character set is designated/invoked to C1,
1262 and usually these designations/invocations are omitted in encoded
1263 text. In a 7-bit environment, only C0 can be used, and a control
1264 character for C1 is encoded by an appropriate escape sequence to
1265 fit into the environment. All control characters for C1 are
1266 defined to have corresponding escape sequences.
1267
1268 A graphic character set is at first designated to one of four
1269 graphic registers (G0 through G3), then these graphic registers are
1270 invoked to GL or GR. These designations and invocations can be
1271 done independently. The most common case is that G0 is invoked to
1272 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1273 these invocations and designations are omitted in encoded text.
1274 In a 7-bit environment, only GL can be used.
1275
1276 When a graphic character set of CHARS94 is invoked to GL, codes
1277 0x20 and 0x7F of the GL area work as control characters SPACE and
1278 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1279 be used.
1280
1281 There are two ways of invocation: locking-shift and single-shift.
1282 With locking-shift, the invocation lasts until the next different
1283 invocation, whereas with single-shift, the invocation affects the
1284 following character only and doesn't affect the locking-shift
1285 state. Invocations are done by the following control characters or
1286 escape sequences:
1287
1288 ----------------------------------------------------------------------
1289 abbrev function cntrl escape seq description
1290 ----------------------------------------------------------------------
1291 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1292 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1293 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1294 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1295 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1296 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1297 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1298 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1299 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1300 ----------------------------------------------------------------------
1301 (*) These are not used by any known coding system.
1302
1303 Control characters for these functions are defined by macros
1304 ISO_CODE_XXX in `coding.h'.
1305
1306 Designations are done by the following escape sequences:
1307 ----------------------------------------------------------------------
1308 escape sequence description
1309 ----------------------------------------------------------------------
1310 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1311 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1312 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1313 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1314 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1315 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1316 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1317 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1318 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1319 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1320 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1321 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1322 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1323 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1324 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1325 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1326 ----------------------------------------------------------------------
1327
1328 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1329 of dimension 1, chars 94, and final character <F>, etc...
1330
1331 Note (*): Although these designations are not allowed in ISO2022,
1332 Emacs accepts them on decoding, and produces them on encoding
1333 CHARS96 character sets in a coding system which is characterized as
1334 7-bit environment, non-locking-shift, and non-single-shift.
1335
1336 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1337 '(' can be omitted. We refer to this as "short-form" hereafter.
1338
1339 Now you may notice that there are a lot of ways of encoding the
1340 same multilingual text in ISO2022. Actually, there exist many
1341 coding systems such as Compound Text (used in X11's inter client
1342 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1343 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1344 localized platforms), and all of these are variants of ISO2022.
1345
1346 In addition to the above, Emacs handles two more kinds of escape
1347 sequences: ISO6429's direction specification and Emacs' private
1348 sequence for specifying character composition.
1349
1350 ISO6429's direction specification takes the following form:
1351 o CSI ']' -- end of the current direction
1352 o CSI '0' ']' -- end of the current direction
1353 o CSI '1' ']' -- start of left-to-right text
1354 o CSI '2' ']' -- start of right-to-left text
1355 The control character CSI (0x9B: control sequence introducer) is
1356 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1357
1358 Character composition specification takes the following form:
1359 o ESC '0' -- start relative composition
1360 o ESC '1' -- end composition
1361 o ESC '2' -- start rule-base composition (*)
1362 o ESC '3' -- start relative composition with alternate chars (**)
1363 o ESC '4' -- start rule-base composition with alternate chars (**)
1364 Since these are not standard escape sequences of any ISO standard,
1365 the use of them with these meanings is restricted to Emacs only.
1366
1367 (*) This form is used only in Emacs 20.5 and older versions,
1368 but the newer versions can safely decode it.
1369 (**) This form is used only in Emacs 21.1 and newer versions,
1370 and the older versions can't decode it.
1371
1372 Here's a list of example usages of these composition escape
1373 sequences (categorized by `enum composition_method').
1374
1375 COMPOSITION_RELATIVE:
1376 ESC 0 CHAR [ CHAR ] ESC 1
1377 COMPOSITION_WITH_RULE:
1378 ESC 2 CHAR [ RULE CHAR ] ESC 1
1379 COMPOSITION_WITH_ALTCHARS:
1380 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1381 COMPOSITION_WITH_RULE_ALTCHARS:
1382 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1383
/* ISO2022 class of each of the 256 byte values.  decode_coding_iso2022
   switches on iso_code_class[c1] for every byte C1 it reads.  The
   table is filled in elsewhere (not in this part of the file).  */
1384 enum iso_code_class_type iso_code_class[256];
1385
/* Nonzero if the coding system registered at coding_system_table[IDX]
   exists, and CHARSET is ASCII or character C is among that coding
   system's safe characters, and the coding system has some requested
   designation for CHARSET.

   Side effect: unless CHARSET is ASCII, this assigns to a variable
   named `safe_chars' that must exist in the scope where the macro is
   expanded.  */
1386 #define CHARSET_OK(idx, charset, c) \
1387 (coding_system_table[idx] \
1388 && (charset == CHARSET_ASCII \
1389 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1390 CODING_SAFE_CHAR_P (safe_chars, c))) \
1391 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1392 charset) \
1393 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1394
/* Nonzero if the coding system at coding_system_table[IDX] has an
   initial designation (a value >= 0) for graphic register 1.
   NOTE(review): unlike CHARSET_OK, there is no check here that
   coding_system_table[IDX] is non-null -- confirm the callers
   guarantee it.  */
1395 #define SHIFT_OUT_OK(idx) \
1396 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1397
/* Nonzero if composition is not disabled for the coding system at
   coding_system_table[IDX].  The table entry is dereferenced without
   a null check.  */
1398 #define COMPOSITION_OK(idx) \
1399 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1400
1401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1402 Check if a text is encoded in ISO2022. If it is, return an
1403 integer in which the appropriate flag bits, any of:
1404 CODING_CATEGORY_MASK_ISO_7
1405 CODING_CATEGORY_MASK_ISO_7_TIGHT
1406 CODING_CATEGORY_MASK_ISO_8_1
1407 CODING_CATEGORY_MASK_ISO_8_2
1408 CODING_CATEGORY_MASK_ISO_7_ELSE
1409 CODING_CATEGORY_MASK_ISO_8_ELSE
1410 are set. If a code which should never appear in ISO2022 is found,
1411 return 0.

   The bytes from SRC up to (not including) SRC_END are scanned.
   MULTIBYTEP is passed through to ONE_MORE_BYTE_CHECK_MULTIBYTE.

   Two masks are maintained: MASK starts with all ISO categories and
   has categories removed as evidence against them is seen;
   MASK_FOUND collects categories for which positive evidence was
   seen.  The result is MASK & MASK_FOUND.  Scanning stops early
   once MASK becomes 0.

   NOTE(review): SHIFT_OUT is initialized to 0 and no statement
   visible in this function assigns it afterwards, so the
   `shift_out == 1' test in the ISO_CODE_SI case never succeeds and
   the `shift_out == 0' test in the ISO_CODE_SO case always does.  */
1412
1413 static int
1414 detect_coding_iso2022 (src, src_end, multibytep)
1415 unsigned char *src, *src_end;
1416 int multibytep;
1417 {
1418 int mask = CODING_CATEGORY_MASK_ISO;
1419 int mask_found = 0;
1420 int reg[4], shift_out = 0, single_shifting = 0;
1421 int c, c1, charset;
1422 /* Dummy for ONE_MORE_BYTE. */
1423 struct coding_system dummy_coding;
1424 struct coding_system *coding = &dummy_coding;
/* Assigned by the CHARSET_OK macro.  */
1425 Lisp_Object safe_chars;
1426
/* REG[n] is the charset currently designated to graphic register
   Gn; -1 means none.  */
1427 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1428 while (mask && src < src_end)
1429 {
1430 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
/* `retry' is re-entered from the 0xA0..0xFF run scan below with
   the byte that terminated the run already in C.  */
1431 retry:
1432 switch (c)
1433 {
1434 case ISO_CODE_ESC:
1435 if (inhibit_iso_escape_detection)
1436 break;
1437 single_shifting = 0;
1438 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1439 if (c >= '(' && c <= '/')
1440 {
1441 /* Designation sequence for a charset of dimension 1. */
1442 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1443 if (c1 < ' ' || c1 >= 0x80
1444 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1445 /* Invalid designation sequence. Just ignore. */
1446 break;
1447 reg[(c - '(') % 4] = charset;
1448 }
1449 else if (c == '$')
1450 {
1451 /* Designation sequence for a charset of dimension 2. */
1452 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1453 if (c >= '@' && c <= 'B')
1454 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1455 reg[0] = charset = iso_charset_table[1][0][c];
1456 else if (c >= '(' && c <= '/')
1457 {
1458 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1459 if (c1 < ' ' || c1 >= 0x80
1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1461 /* Invalid designation sequence. Just ignore. */
1462 break;
1463 reg[(c - '(') % 4] = charset;
1464 }
1465 else
1466 /* Invalid designation sequence. Just ignore. */
1467 break;
1468 }
1469 else if (c == 'N' || c == 'O')
1470 {
1471 /* ESC <Fe> for SS2 or SS3. */
1472 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1473 break;
1474 }
1475 else if (c >= '0' && c <= '4')
1476 {
1477 /* ESC <Fp> for start/end composition. */
1478 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1486 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1490 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1494 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1495 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1496 else
1497 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1498 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1499 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1500 else
1501 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1502 break;
1503 }
1504 else
1505 /* Invalid escape sequence. Just ignore. */
1506 break;
1507
/* Only the two designation branches above fall through to here;
   every other ESC branch ends in `break'.  */
1508 /* We found a valid designation sequence for CHARSET. */
1509 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1510 c = MAKE_CHAR (charset, 0, 0);
1511 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1512 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1513 else
1514 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1515 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1516 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1517 else
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1519 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1520 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1521 else
1522 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1523 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1524 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1525 else
1526 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1527 break;
1528
1529 case ISO_CODE_SO:
1530 if (inhibit_iso_escape_detection)
1531 break;
1532 single_shifting = 0;
1533 if (shift_out == 0
1534 && (reg[1] >= 0
1535 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1536 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1537 {
1538 /* Locking shift out. */
1539 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1540 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1541 }
1542 break;
1543
1544 case ISO_CODE_SI:
1545 if (inhibit_iso_escape_detection)
1546 break;
1547 single_shifting = 0;
1548 if (shift_out == 1)
1549 {
1550 /* Locking shift in. */
1551 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1552 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1553 }
1554 break;
1555
1556 case ISO_CODE_CSI:
1557 single_shifting = 0;
/* Falls through: CSI shares the handling below with SS2 and SS3.  */
1558 case ISO_CODE_SS2:
1559 case ISO_CODE_SS3:
1560 {
1561 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1562
1563 if (inhibit_iso_escape_detection)
1564 break;
1565 if (c != ISO_CODE_CSI)
1566 {
1567 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1568 & CODING_FLAG_ISO_SINGLE_SHIFT)
1569 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1570 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1571 & CODING_FLAG_ISO_SINGLE_SHIFT)
1572 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1573 single_shifting = 1;
1574 }
1575 if (VECTORP (Vlatin_extra_code_table)
1576 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1577 {
1578 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1579 & CODING_FLAG_ISO_LATIN_EXTRA)
1580 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1581 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1582 & CODING_FLAG_ISO_LATIN_EXTRA)
1583 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1584 }
1585 mask &= newmask;
1586 mask_found |= newmask;
1587 }
1588 break;
1589
1590 default:
1591 if (c < 0x80)
1592 {
1593 single_shifting = 0;
1594 break;
1595 }
1596 else if (c < 0xA0)
1597 {
/* A byte in 0x80..0x9F not handled by the cases above: acceptable
   only if it is listed in Vlatin_extra_code_table; otherwise the
   text is rejected outright.  */
1598 single_shifting = 0;
1599 if (VECTORP (Vlatin_extra_code_table)
1600 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1601 {
1602 int newmask = 0;
1603
1604 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1605 & CODING_FLAG_ISO_LATIN_EXTRA)
1606 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1607 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1608 & CODING_FLAG_ISO_LATIN_EXTRA)
1609 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1610 mask &= newmask;
1611 mask_found |= newmask;
1612 }
1613 else
1614 return 0;
1615 }
1616 else
1617 {
1618 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1619 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1620 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1621 /* Check the length of succeeding codes of the range
1622 0xA0..0xFF. If the byte length is odd, we exclude
1623 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1624 when we are not single shifting. */
1625 if (!single_shifting
1626 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1627 {
/* I counts the bytes of the run, starting with the one already
   read.  C stays -1 only if the source ends before any further
   byte is read.  */
1628 int i = 1;
1629
1630 c = -1;
1631 while (src < src_end)
1632 {
1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1634 if (c < 0xA0)
1635 break;
1636 i++;
1637 }
1638
1639 if (i & 1 && src < src_end)
1640 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1641 else
1642 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1643 if (c >= 0)
1644 /* This means that we have read one extra byte. */
1645 goto retry;
1646 }
1647 }
1648 break;
1649 }
1650 }
/* No `goto label_end_of_loop' is visible in this function; the label
   is presumably the target of ONE_MORE_BYTE_CHECK_MULTIBYTE when the
   source runs out -- confirm against the macro.  */
1651 label_end_of_loop:
1652 return (mask & mask_found);
1653 }
1654
1655 /* Decode a character of which charset is CHARSET, the 1st position
1656 code is C1, the 2nd position code is C2, and return the decoded
1657 character code. If the variable `translation_table' is non-nil,
1658 return the translated code instead.

   `translation_table' is not a parameter: it must be a variable in
   the scope where the macro is expanded.  */
1659
1660 #define DECODE_ISO_CHARACTER(charset, c1, c2) \
1661 (NILP (translation_table) \
1662 ? MAKE_CHAR (charset, c1, c2) \
1663 : translate_char (translation_table, -1, charset, c1, c2))
1664
1665 /* Set designation state into CODING.

   Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   and designate it to graphic register REG of `coding'.  The
   designation is accepted when the charset exists and either the
   coding system requests it for REG or a character of the charset is
   in `safe_chars'.  If CODING_MODE_DIRECTION is set and the charset
   has a reverse charset, the reverse charset is designated instead.

   Control leaves through `goto label_invalid_code' when FINAL_CHAR is
   outside '0'..127, when the designation is not accepted (REG is then
   remembered in last_invalid_designation_register), and in the
   special ASCII-to-G0 case commented below.

   `coding', `safe_chars' and the label `label_invalid_code' are taken
   from the scope where the macro is expanded.

   NOTE(review): MAKE_CHAR (charset, 0, 0) is evaluated before the
   `charset >= 0' test, i.e. also for an invalid charset; the result
   is only used when the test passes.  */
1666 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1667 do { \
1668 int charset, c; \
1669 \
1670 if (final_char < '0' || final_char >= 128) \
1671 goto label_invalid_code; \
1672 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1673 make_number (chars), \
1674 make_number (final_char)); \
1675 c = MAKE_CHAR (charset, 0, 0); \
1676 if (charset >= 0 \
1677 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1678 || CODING_SAFE_CHAR_P (safe_chars, c))) \
1679 { \
1680 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1681 && reg == 0 \
1682 && charset == CHARSET_ASCII) \
1683 { \
1684 /* We should insert this designation sequence as is so \
1685 that it is surely written back to a file. */ \
1686 coding->spec.iso2022.last_invalid_designation_register = -1; \
1687 goto label_invalid_code; \
1688 } \
1689 coding->spec.iso2022.last_invalid_designation_register = -1; \
1690 if ((coding->mode & CODING_MODE_DIRECTION) \
1691 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1692 charset = CHARSET_REVERSE_CHARSET (charset); \
1693 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1694 } \
1695 else \
1696 { \
1697 coding->spec.iso2022.last_invalid_designation_register = reg; \
1698 goto label_invalid_code; \
1699 } \
1700 } while (0)
1701
1702 /* Allocate a memory block for storing information about compositions.
1703 The block is chained to the already allocated blocks. */
1704
1705 void
1706 coding_allocate_composition_data (coding, char_offset)
1707 struct coding_system *coding;
1708 int char_offset;
1709 {
1710 struct composition_data *cmp_data
1711 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1712
1713 cmp_data->char_offset = char_offset;
1714 cmp_data->used = 0;
1715 cmp_data->prev = coding->cmp_data;
1716 cmp_data->next = NULL;
1717 if (coding->cmp_data)
1718 coding->cmp_data->next = cmp_data;
1719 coding->cmp_data = cmp_data;
1720 coding->cmp_data_start = 0;
1721 coding->composing = COMPOSITION_NO;
1722 }
1723
1724 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1725 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1726 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1727 ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1728 ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  Three cases are handled:
   - composition is disabled for `coding': ESC and C1 & 0x7f are
     written to `dst' as two ordinary characters;
   - no composition is in progress: a new composition is started
     with the method selected by C1 (any value other than '0', '2'
     or '3' selects COMPOSITION_WITH_RULE_ALTCHARS);
   - a composition is already in progress: for the two ALTCHARS
     methods the method is switched to COMPOSITION_RELATIVE,
     otherwise nothing is done.

   `coding', `dst' and the label `label_end_of_loop' are taken from
   the scope where the macro is expanded.
1729 */
1730
1731 #define DECODE_COMPOSITION_START(c1) \
1732 do { \
1733 if (coding->composing == COMPOSITION_DISABLED) \
1734 { \
1735 *dst++ = ISO_CODE_ESC; \
1736 *dst++ = c1 & 0x7f; \
1737 coding->produced_char += 2; \
1738 } \
1739 else if (!COMPOSING_P (coding)) \
1740 { \
1741 /* This is surely the start of a composition. We must be sure \
1742 that coding->cmp_data has enough space to store the \
1743 information about the composition. If not, terminate the \
1744 current decoding loop, allocate one more memory block for \
1745 coding->cmp_data in the caller, then start the decoding \
1746 loop again. We can't allocate memory here directly because \
1747 it may cause buffer/string relocation. */ \
1748 if (!coding->cmp_data \
1749 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1750 >= COMPOSITION_DATA_SIZE)) \
1751 { \
1752 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1753 goto label_end_of_loop; \
1754 } \
1755 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1756 : c1 == '2' ? COMPOSITION_WITH_RULE \
1757 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1758 : COMPOSITION_WITH_RULE_ALTCHARS); \
1759 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1760 coding->composing); \
1761 coding->composition_rule_follows = 0; \
1762 } \
1763 else \
1764 { \
1765 /* We are already handling a composition. If the method is \
1766 the following two, the codes following the current escape \
1767 sequence are actual characters stored in a buffer. */ \
1768 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1769 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1770 { \
1771 coding->composing = COMPOSITION_RELATIVE; \
1772 coding->composition_rule_follows = 0; \
1773 } \
1774 } \
1775 } while (0)
1776
1777 /* Handle composition end sequence ESC 1.

   If no composition is in progress, ESC and C1 are written to `dst'
   as two ordinary characters.  Otherwise the end of the composition
   is recorded at the current CODING->produced_char and
   CODING->composing is reset to COMPOSITION_NO.

   `coding' and `dst' are taken from the scope where the macro is
   expanded.  */
1778
1779 #define DECODE_COMPOSITION_END(c1) \
1780 do { \
1781 if (! COMPOSING_P (coding)) \
1782 { \
1783 *dst++ = ISO_CODE_ESC; \
1784 *dst++ = c1; \
1785 coding->produced_char += 2; \
1786 } \
1787 else \
1788 { \
1789 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
1790 coding->composing = COMPOSITION_NO; \
1791 } \
1792 } while (0)
1793
1794 /* Decode a composition rule from the byte C1 (and maybe one more byte
1795 from SRC) and store one encoded composition rule in
1796 coding->cmp_data.

   C1 must be a modifiable variable: 32 is subtracted from it in
   place.  After the subtraction, a value below 81 is an old-format
   rule packed as GREF * 9 + NREF (with 4 mapped to 10 in either
   field); a value from 81 to 92 is a new-format rule whose second
   byte is read with ONE_MORE_BYTE into a variable named `c2' that
   must exist in the expanding scope.  For any larger value RULE
   stays 0 and is still stored.

   `coding' is taken from the scope where the macro is expanded.  */
1797
1798 #define DECODE_COMPOSITION_RULE(c1) \
1799 do { \
1800 int rule = 0; \
1801 (c1) -= 32; \
1802 if (c1 < 81) /* old format (before ver.21) */ \
1803 { \
1804 int gref = (c1) / 9; \
1805 int nref = (c1) % 9; \
1806 if (gref == 4) gref = 10; \
1807 if (nref == 4) nref = 10; \
1808 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
1809 } \
1810 else if (c1 < 93) /* new format (after ver.21) */ \
1811 { \
1812 ONE_MORE_BYTE (c2); \
1813 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1814 } \
1815 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
1816 coding->composition_rule_follows = 0; \
1817 } while (0)
1818
1819
1820 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1821
1822 static void
1823 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1824 struct coding_system *coding;
1825 const unsigned char *source;
1826 unsigned char *destination;
1827 int src_bytes, dst_bytes;
1828 {
1829 const unsigned char *src = source;
1830 const unsigned char *src_end = source + src_bytes;
1831 unsigned char *dst = destination;
1832 unsigned char *dst_end = destination + dst_bytes;
1833 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1834 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1835 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1836 /* SRC_BASE remembers the start position in source in each loop.
1837 The loop will be exited when there's not enough source code
1838 (within macro ONE_MORE_BYTE), or when there's not enough
1839 destination area to produce a character (within macro
1840 EMIT_CHAR). */
1841 const unsigned char *src_base;
1842 int c, charset;
1843 Lisp_Object translation_table;
1844 Lisp_Object safe_chars;
1845
1846 safe_chars = coding_safe_chars (coding->symbol);
1847
1848 if (NILP (Venable_character_translation))
1849 translation_table = Qnil;
1850 else
1851 {
1852 translation_table = coding->translation_table_for_decode;
1853 if (NILP (translation_table))
1854 translation_table = Vstandard_translation_table_for_decode;
1855 }
1856
1857 coding->result = CODING_FINISH_NORMAL;
1858
1859 while (1)
1860 {
1861 int c1, c2 = 0;
1862
1863 src_base = src;
1864 ONE_MORE_BYTE (c1);
1865
1866 /* We produce no character or one character. */
1867 switch (iso_code_class [c1])
1868 {
1869 case ISO_0x20_or_0x7F:
1870 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1871 {
1872 DECODE_COMPOSITION_RULE (c1);
1873 continue;
1874 }
1875 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1876 {
1877 /* This is SPACE or DEL. */
1878 charset = CHARSET_ASCII;
1879 break;
1880 }
1881 /* This is a graphic character, we fall down ... */
1882
1883 case ISO_graphic_plane_0:
1884 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1885 {
1886 DECODE_COMPOSITION_RULE (c1);
1887 continue;
1888 }
1889 charset = charset0;
1890 break;
1891
1892 case ISO_0xA0_or_0xFF:
1893 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1894 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1895 goto label_invalid_code;
1896 /* This is a graphic character, we fall down ... */
1897
1898 case ISO_graphic_plane_1:
1899 if (charset1 < 0)
1900 goto label_invalid_code;
1901 charset = charset1;
1902 break;
1903
1904 case ISO_control_0:
1905 if (COMPOSING_P (coding))
1906 DECODE_COMPOSITION_END ('1');
1907
1908 /* All ISO2022 control characters in this class have the
1909 same representation in Emacs internal format. */
1910 if (c1 == '\n'
1911 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1912 && (coding->eol_type == CODING_EOL_CR
1913 || coding->eol_type == CODING_EOL_CRLF))
1914 {
1915 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1916 goto label_end_of_loop;
1917 }
1918 charset = CHARSET_ASCII;
1919 break;
1920
1921 case ISO_control_1:
1922 if (COMPOSING_P (coding))
1923 DECODE_COMPOSITION_END ('1');
1924 goto label_invalid_code;
1925
1926 case ISO_carriage_return:
1927 if (COMPOSING_P (coding))
1928 DECODE_COMPOSITION_END ('1');
1929
1930 if (coding->eol_type == CODING_EOL_CR)
1931 c1 = '\n';
1932 else if (coding->eol_type == CODING_EOL_CRLF)
1933 {
1934 ONE_MORE_BYTE (c1);
1935 if (c1 != ISO_CODE_LF)
1936 {
1937 src--;
1938 c1 = '\r';
1939 }
1940 }
1941 charset = CHARSET_ASCII;
1942 break;
1943
1944 case ISO_shift_out:
1945 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1946 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1947 goto label_invalid_code;
1948 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1949 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1950 continue;
1951
1952 case ISO_shift_in:
1953 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1954 goto label_invalid_code;
1955 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1956 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1957 continue;
1958
1959 case ISO_single_shift_2_7:
1960 case ISO_single_shift_2:
1961 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1962 goto label_invalid_code;
1963 /* SS2 is handled as an escape sequence of ESC 'N' */
1964 c1 = 'N';
1965 goto label_escape_sequence;
1966
1967 case ISO_single_shift_3:
1968 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1969 goto label_invalid_code;
1970 /* SS3 is handled as an escape sequence of ESC 'O' */
1971 c1 = 'O';
1972 goto label_escape_sequence;
1973
1974 case ISO_control_sequence_introducer:
1975 /* CSI is handled as an escape sequence of ESC '[' ... */
1976 c1 = '[';
1977 goto label_escape_sequence;
1978
1979 case ISO_escape:
1980 ONE_MORE_BYTE (c1);
1981 label_escape_sequence:
1982 /* Escape sequences handled by Emacs are invocation,
1983 designation, direction specification, and character
1984 composition specification. */
1985 switch (c1)
1986 {
1987 case '&': /* revision of following character set */
1988 ONE_MORE_BYTE (c1);
1989 if (!(c1 >= '@' && c1 <= '~'))
1990 goto label_invalid_code;
1991 ONE_MORE_BYTE (c1);
1992 if (c1 != ISO_CODE_ESC)
1993 goto label_invalid_code;
1994 ONE_MORE_BYTE (c1);
1995 goto label_escape_sequence;
1996
1997 case '$': /* designation of 2-byte character set */
1998 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1999 goto label_invalid_code;
2000 ONE_MORE_BYTE (c1);
2001 if (c1 >= '@' && c1 <= 'B')
2002 { /* designation of JISX0208.1978, GB2312.1980,
2003 or JISX0208.1980 */
2004 DECODE_DESIGNATION (0, 2, 94, c1);
2005 }
2006 else if (c1 >= 0x28 && c1 <= 0x2B)
2007 { /* designation of DIMENSION2_CHARS94 character set */
2008 ONE_MORE_BYTE (c2);
2009 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2010 }
2011 else if (c1 >= 0x2C && c1 <= 0x2F)
2012 { /* designation of DIMENSION2_CHARS96 character set */
2013 ONE_MORE_BYTE (c2);
2014 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2015 }
2016 else
2017 goto label_invalid_code;
2018 /* We must update these variables now. */
2019 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2020 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2021 continue;
2022
2023 case 'n': /* invocation of locking-shift-2 */
2024 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2025 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2026 goto label_invalid_code;
2027 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2028 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2029 continue;
2030
2031 case 'o': /* invocation of locking-shift-3 */
2032 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2033 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2034 goto label_invalid_code;
2035 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2036 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2037 continue;
2038
2039 case 'N': /* invocation of single-shift-2 */
2040 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2041 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2042 goto label_invalid_code;
2043 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2044 ONE_MORE_BYTE (c1);
2045 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2046 goto label_invalid_code;
2047 break;
2048
2049 case 'O': /* invocation of single-shift-3 */
2050 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2051 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2052 goto label_invalid_code;
2053 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2054 ONE_MORE_BYTE (c1);
2055 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2056 goto label_invalid_code;
2057 break;
2058
2059 case '0': case '2': case '3': case '4': /* start composition */
2060 DECODE_COMPOSITION_START (c1);
2061 continue;
2062
2063 case '1': /* end composition */
2064 DECODE_COMPOSITION_END (c1);
2065 continue;
2066
2067 case '[': /* specification of direction */
2068 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2069 goto label_invalid_code;
2070 /* For the moment, nested direction is not supported.
2071 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2072 left-to-right, and nonzero means right-to-left. */
2073 ONE_MORE_BYTE (c1);
2074 switch (c1)
2075 {
2076 case ']': /* end of the current direction */
2077 coding->mode &= ~CODING_MODE_DIRECTION;
2078
2079 case '0': /* end of the current direction */
2080 case '1': /* start of left-to-right direction */
2081 ONE_MORE_BYTE (c1);
2082 if (c1 == ']')
2083 coding->mode &= ~CODING_MODE_DIRECTION;
2084 else
2085 goto label_invalid_code;
2086 break;
2087
2088 case '2': /* start of right-to-left direction */
2089 ONE_MORE_BYTE (c1);
2090 if (c1 == ']')
2091 coding->mode |= CODING_MODE_DIRECTION;
2092 else
2093 goto label_invalid_code;
2094 break;
2095
2096 default:
2097 goto label_invalid_code;
2098 }
2099 continue;
2100
2101 case '%':
2102 if (COMPOSING_P (coding))
2103 DECODE_COMPOSITION_END ('1');
2104 ONE_MORE_BYTE (c1);
2105 if (c1 == '/')
2106 {
2107 /* CTEXT extended segment:
2108 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2109 We keep these bytes as is for the moment.
2110 They may be decoded by post-read-conversion. */
2111 int dim, M, L;
2112 int size, required;
2113 int produced_chars;
2114
2115 ONE_MORE_BYTE (dim);
2116 ONE_MORE_BYTE (M);
2117 ONE_MORE_BYTE (L);
2118 size = ((M - 128) * 128) + (L - 128);
2119 required = 8 + size * 2;
2120 if (dst + required > (dst_bytes ? dst_end : src))
2121 goto label_end_of_loop;
2122 *dst++ = ISO_CODE_ESC;
2123 *dst++ = '%';
2124 *dst++ = '/';
2125 *dst++ = dim;
2126 produced_chars = 4;
2127 dst += CHAR_STRING (M, dst), produced_chars++;
2128 dst += CHAR_STRING (L, dst), produced_chars++;
2129 while (size-- > 0)
2130 {
2131 ONE_MORE_BYTE (c1);
2132 dst += CHAR_STRING (c1, dst), produced_chars++;
2133 }
2134 coding->produced_char += produced_chars;
2135 }
2136 else if (c1 == 'G')
2137 {
2138 unsigned char *d = dst;
2139 int produced_chars;
2140
2141 /* XFree86 extension for embedding UTF-8 in CTEXT:
2142 ESC % G --UTF-8-BYTES-- ESC % @
2143 We keep these bytes as is for the moment.
2144 They may be decoded by post-read-conversion. */
2145 if (d + 6 > (dst_bytes ? dst_end : src))
2146 goto label_end_of_loop;
2147 *d++ = ISO_CODE_ESC;
2148 *d++ = '%';
2149 *d++ = 'G';
2150 produced_chars = 3;
2151 while (d + 1 < (dst_bytes ? dst_end : src))
2152 {
2153 ONE_MORE_BYTE (c1);
2154 if (c1 == ISO_CODE_ESC
2155 && src + 1 < src_end
2156 && src[0] == '%'
2157 && src[1] == '@')
2158 {
2159 src += 2;
2160 break;
2161 }
2162 d += CHAR_STRING (c1, d), produced_chars++;
2163 }
2164 if (d + 3 > (dst_bytes ? dst_end : src))
2165 goto label_end_of_loop;
2166 *d++ = ISO_CODE_ESC;
2167 *d++ = '%';
2168 *d++ = '@';
2169 dst = d;
2170 coding->produced_char += produced_chars + 3;
2171 }
2172 else
2173 goto label_invalid_code;
2174 continue;
2175
2176 default:
2177 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2178 goto label_invalid_code;
2179 if (c1 >= 0x28 && c1 <= 0x2B)
2180 { /* designation of DIMENSION1_CHARS94 character set */
2181 ONE_MORE_BYTE (c2);
2182 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2183 }
2184 else if (c1 >= 0x2C && c1 <= 0x2F)
2185 { /* designation of DIMENSION1_CHARS96 character set */
2186 ONE_MORE_BYTE (c2);
2187 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2188 }
2189 else
2190 goto label_invalid_code;
2191 /* We must update these variables now. */
2192 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2193 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2194 continue;
2195 }
2196 }
2197
2198 /* Now we know CHARSET and 1st position code C1 of a character.
2199 Produce a multibyte sequence for that character while getting
2200 2nd position code C2 if necessary. */
2201 if (CHARSET_DIMENSION (charset) == 2)
2202 {
2203 ONE_MORE_BYTE (c2);
2204 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2205 /* C2 is not in a valid range. */
2206 goto label_invalid_code;
2207 }
2208 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2209 EMIT_CHAR (c);
2210 continue;
2211
2212 label_invalid_code:
2213 coding->errors++;
2214 if (COMPOSING_P (coding))
2215 DECODE_COMPOSITION_END ('1');
2216 src = src_base;
2217 c = *src++;
2218 if (! NILP (translation_table))
2219 c = translate_char (translation_table, c, 0, 0, 0);
2220 EMIT_CHAR (c);
2221 }
2222
2223 label_end_of_loop:
2224 coding->consumed = coding->consumed_char = src_base - source;
2225 coding->produced = dst - destination;
2226 return;
2227 }
2228
2229
2230 /* ISO2022 encoding stuff. */
2231
2232 /*
2233 It is not enough to say just "ISO2022" on encoding, we have to
2234 specify more details. In Emacs, each ISO2022 coding system
2235 variant has the following specifications:
2236 1. Initial designation to G0 through G3.
2237 2. Allows short-form designation?
2238 3. ASCII should be designated to G0 before control characters?
2239 4. ASCII should be designated to G0 at end of line?
2240 5. 7-bit environment or 8-bit environment?
2241 6. Use locking-shift?
2242 7. Use Single-shift?
2243 And the following two are only for Japanese:
2244 8. Use ASCII in place of JIS0201-1976-Roman?
2245 9. Use JISX0208-1983 in place of JISX0208-1978?
2246 These specifications are encoded in `coding->flags' as flag bits
2247 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2248 details.
2249 */
2250
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  A revision escape (ESC & <rev>) is emitted first when the
   revision number of CHARSET is below 255.  The short form, which
   omits the intermediate byte, is used only for a 2-dimensional
   94-character set designated to register 0 whose <final-char> is
   '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    const char *designators_94 = "()*+";				\
    const char *designators_96 = ",-./";				\
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);	\
    int revision_number							\
      = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);		\
    int is_one_dimension = CHARSET_DIMENSION (charset) == 1;		\
    int is_94_set = CHARSET_CHARS (charset) == 94;			\
    int short_form_ok							\
      = ((coding->flags & CODING_FLAG_ISO_SHORT_FORM)			\
	 && reg == 0							\
	 && final_byte >= '@' && final_byte <= 'B');			\
									\
    if (revision_number < 255)						\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + revision_number;					\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (! is_one_dimension)						\
      *dst++ = '$';							\
    if (! is_94_set)							\
      *dst++ = (unsigned char) (designators_96[reg]);			\
    else if (is_one_dimension || ! short_form_ok)			\
      *dst++ = (unsigned char) (designators_94[reg]);			\
    *dst++ = final_byte;						\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
2293
/* Single-shift functions of ISO2022.  Each macro writes the code for
   single-shift-2 or single-shift-3 at DST (ESC 'N' / ESC 'O' when
   CODING_FLAG_ISO_SEVEN_BITS is set, otherwise the one-byte control
   ISO_CODE_SS2 / ISO_CODE_SS3) and then marks CODING as being in the
   single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2						\
  do {									\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))			\
      *dst++ = ISO_CODE_SS2;						\
    else								\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = 'N';							\
      }									\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;			\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3						\
  do {									\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))			\
      *dst++ = ISO_CODE_SS3;						\
    else								\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = 'O';							\
      }									\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;			\
  } while (0)
2315
/* Locking-shift functions of ISO2022 (shift-in, shift-out,
   locking-shift-2, and locking-shift-3).  Each macro records in
   CODING which graphic register (0, 1, 2, or 3 respectively) is now
   invoked to graphic plane 0 and writes the corresponding control
   character or escape sequence at DST.  */

#define ENCODE_SHIFT_IN							\
  do {									\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;				\
    *dst++ = ISO_CODE_SI;						\
  } while (0)

#define ENCODE_SHIFT_OUT						\
  do {									\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;				\
    *dst++ = ISO_CODE_SO;						\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2						\
  do {									\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;				\
    *dst++ = ISO_CODE_ESC;						\
    *dst++ = 'n';							\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3						\
  do {									\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;				\
    *dst++ = ISO_CODE_ESC;						\
    *dst++ = 'o';							\
  } while (0)
2343
/* Write at DST the single byte for a DIMENSION1 character of CHARSET
   whose position-code is C1.  The loop keeps calling
   encode_invocation_designation until CHARSET is reachable, i.e. a
   single shift is pending or CHARSET is invoked to graphic plane 0
   or 1, and then emits the byte and leaves.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single shift was just produced; it covers this one		\
	   character only, so clear the state afterwards.  */		\
	if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
	  *dst++ = c1 | 0x80;						\
	else								\
	  *dst++ = c1 & 0x7F;						\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	*dst++ = c1 & 0x7F;						\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	*dst++ = c1 | 0x80;						\
	break;								\
      }									\
    /* CHARSET is not invoked to either graphic plane yet.  Produce	\
       the designation and/or invocation codes, then go round the	\
       loop again to emit the character itself.  */			\
    dst = encode_invocation_designation (charset, coding, dst);		\
  } while (1)
2376
/* Write at DST the two bytes for a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2.  The loop keeps calling
   encode_invocation_designation until CHARSET is reachable, i.e. a
   single shift is pending or CHARSET is invoked to graphic plane 0
   or 1, and then emits the bytes and leaves.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single shift was just produced; it covers this one		\
	   character only, so clear the state afterwards.  */		\
	if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    *dst++ = c2 | 0x80;						\
	  }								\
	else								\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    *dst++ = c2 & 0x7F;						\
	  }								\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	*dst++ = c1 & 0x7F;						\
	*dst++ = c2 & 0x7F;						\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	*dst++ = c1 | 0x80;						\
	*dst++ = c2 | 0x80;						\
	break;								\
      }									\
    /* CHARSET is not invoked to either graphic plane yet.  Produce	\
       the designation and/or invocation codes, then go round the	\
       loop again to emit the character itself.  */			\
    dst = encode_invocation_designation (charset, coding, dst);		\
  } while (1)
2409
/* Encode character C at DST.  C is split into a charset and position
   codes.  If the charset is not defined, the position codes are
   written out as they are (the second one only when it is
   non-negative).  Otherwise the character goes through the
   DIMENSION1 or DIMENSION2 encoder, after ASCII has been replaced by
   latin-jisx0201 (CODING_FLAG_ISO_USE_ROMAN) or jisx0208 by
   jisx0208-1978 (CODING_FLAG_ISO_USE_OLDJIS) when CODING asks for
   it.  */

#define ENCODE_ISO_CHARACTER(c)						\
  do {									\
    int iso_charset, pos1, pos2;					\
									\
    SPLIT_CHAR (c, iso_charset, pos1, pos2);				\
    if (! CHARSET_DEFINED_P (iso_charset))				\
      {									\
	*dst++ = pos1;							\
	if (pos2 >= 0)							\
	  *dst++ = pos2;						\
      }									\
    else if (CHARSET_DIMENSION (iso_charset) == 1)			\
      {									\
	if (iso_charset == CHARSET_ASCII				\
	    && coding->flags & CODING_FLAG_ISO_USE_ROMAN)		\
	  iso_charset = charset_latin_jisx0201;				\
	ENCODE_ISO_CHARACTER_DIMENSION1 (iso_charset, pos1);		\
      }									\
    else								\
      {									\
	if (iso_charset == charset_jisx0208				\
	    && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)		\
	  iso_charset = charset_jisx0208_1978;				\
	ENCODE_ISO_CHARACTER_DIMENSION2 (iso_charset, pos1, pos2);	\
      }									\
  } while (0)
2439
2440
/* Do not encode character C itself; encode
   CODING_REPLACEMENT_CHARACTER in its place, once, or twice when the
   charset of C has a width greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)					\
  do {									\
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);		\
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) <= 1)				\
      break;								\
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);		\
  } while (0)
2449
2450
2451 /* Produce designation and invocation codes at a place pointed by DST
2452 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2453 Return new DST. */
2454
2455 unsigned char *
2456 encode_invocation_designation (charset, coding, dst)
2457 int charset;
2458 struct coding_system *coding;
2459 unsigned char *dst;
2460 {
2461 int reg; /* graphic register number */
2462
2463 /* At first, check designations. */
2464 for (reg = 0; reg < 4; reg++)
2465 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2466 break;
2467
2468 if (reg >= 4)
2469 {
2470 /* CHARSET is not yet designated to any graphic registers. */
2471 /* At first check the requested designation. */
2472 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2473 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2474 /* Since CHARSET requests no special designation, designate it
2475 to graphic register 0. */
2476 reg = 0;
2477
2478 ENCODE_DESIGNATION (charset, reg, coding);
2479 }
2480
2481 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2482 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2483 {
2484 /* Since the graphic register REG is not invoked to any graphic
2485 planes, invoke it to graphic plane 0. */
2486 switch (reg)
2487 {
2488 case 0: /* graphic register 0 */
2489 ENCODE_SHIFT_IN;
2490 break;
2491
2492 case 1: /* graphic register 1 */
2493 ENCODE_SHIFT_OUT;
2494 break;
2495
2496 case 2: /* graphic register 2 */
2497 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2498 ENCODE_SINGLE_SHIFT_2;
2499 else
2500 ENCODE_LOCKING_SHIFT_2;
2501 break;
2502
2503 case 3: /* graphic register 3 */
2504 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2505 ENCODE_SINGLE_SHIFT_3;
2506 else
2507 ENCODE_LOCKING_SHIFT_3;
2508 break;
2509 }
2510 }
2511
2512 return dst;
2513 }
2514
/* Write at DST the two bytes that encode composition rule RULE.
   RULE is decoded by COMPOSITION_DECODE_RULE into two values; the
   first is written as 32 + 81 + value, the second as 32 + value.  */

#define ENCODE_COMPOSITION_RULE(rule)					\
  do {									\
    int ref_g, ref_n;							\
									\
    COMPOSITION_DECODE_RULE (rule, ref_g, ref_n);			\
    dst[0] = 32 + 81 + ref_g;						\
    dst[1] = 32 + ref_n;						\
    dst += 2;								\
  } while (0)
2524
/* Write at DST the escape sequence that starts a composition: ESC 0
   for COMPOSITION_RELATIVE, ESC 3 for COMPOSITION_WITH_ALTCHARS, and
   ESC 4 for any other method.  DATA points to an array of integers
   describing the composition (see the comment in coding.h for its
   format); DATA[3] becomes CODING->composing.  For the non-relative
   methods the component index is set to the fifth element of the
   current composition data and the rule-follows flag is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)				\
  do {									\
    coding->composing = data[3];					\
    *dst++ = ISO_CODE_ESC;						\
    if (coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	if (coding->composing == COMPOSITION_WITH_ALTCHARS)		\
	  *dst++ = '3';							\
	else								\
	  *dst++ = '4';							\
	coding->cmp_data_index = coding->cmp_data_start + 4;		\
	coding->composition_rule_follows = 0;				\
      }									\
    else								\
      *dst++ = '0';							\
  } while (0)
2544
/* Write ESC 1 at DST to end the current composition, advance
   CODING->cmp_data_start by DATA[0], and reset CODING->composing to
   COMPOSITION_NO.  When that exhausts the current composition data
   block and another block is chained after it, switch to the next
   block.  */

#define ENCODE_COMPOSITION_END(coding, data)				\
  do {									\
    *dst++ = ISO_CODE_ESC;						\
    *dst++ = '1';							\
    coding->cmp_data_start += data[0];					\
    coding->composing = COMPOSITION_NO;					\
    if (coding->cmp_data_start == coding->cmp_data->used)		\
      {									\
	if (coding->cmp_data->next)					\
	  {								\
	    coding->cmp_data = coding->cmp_data->next;			\
	    coding->cmp_data_start = 0;					\
	  }								\
      }									\
  } while (0)
2560
/* Write ESC 0 at DST and switch CODING->composing to
   COMPOSITION_RELATIVE.  This is not the start of a new composition:
   the components (alternate chars and composition rules) of the
   current one have just been produced, and the sequence marks that
   the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)				\
  do {									\
    coding->composing = COMPOSITION_RELATIVE;				\
    *dst++ = ISO_CODE_ESC;						\
    *dst++ = '0';							\
  } while (0)
2572
/* The following three macros produce codes for indicating direction
   of text.  All of them write at DST and advance it, and all of them
   are single statements to be followed by a semicolon.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes ESC '[' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, and the one-byte ISO_CODE_CSI
   otherwise.  CODING->flags is a set of flag bits, so the bit is
   tested with `&' as everywhere else in this file; comparing the
   whole word with `==' would fail whenever any other flag is set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by "2]" and "0]" respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '[';							\
      }									\
    else								\
      *dst++ = ISO_CODE_CSI;						\
  } while (0)

#define ENCODE_DIRECTION_R2L						\
  do {									\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;					\
    *dst++ = '2';							\
    *dst++ = ']';							\
  } while (0)

#define ENCODE_DIRECTION_L2R						\
  do {									\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;					\
    *dst++ = '0';							\
    *dst++ = ']';							\
  } while (0)
2588
/* Bring the graphic planes and registers back to their initial
   state: produce shift-in if graphic plane 0 does not have register
   0 invoked, then re-designate every register whose initial
   designation is non-negative and differs from its current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					\
  do {									\
    int gr;								\
									\
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			\
      {									\
	ENCODE_SHIFT_IN;						\
      }									\
    for (gr = 0; gr < 4; gr++)						\
      {									\
	if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr) < 0)	\
	  continue;							\
	if (CODING_SPEC_ISO_DESIGNATION (coding, gr)			\
	    == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr))	\
	  continue;							\
	ENCODE_DESIGNATION						\
	  (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr), gr, coding); \
      }									\
  } while (0)
2603
2604 /* Produce designation sequences of charsets in the line started from
2605 SRC to a place pointed by DST, and return updated DST.
2606
2607 If the current block ends before any end-of-line, we may fail to
2608 find all the necessary designations. */
2609
2610 static unsigned char *
2611 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2612 struct coding_system *coding;
2613 Lisp_Object translation_table;
2614 const unsigned char *src, *src_end;
2615 unsigned char *dst;
2616 {
2617 int charset, c, found = 0, reg;
2618 /* Table of charsets to be designated to each graphic register. */
2619 int r[4];
2620
2621 for (reg = 0; reg < 4; reg++)
2622 r[reg] = -1;
2623
2624 while (found < 4)
2625 {
2626 ONE_MORE_CHAR (c);
2627 if (c == '\n')
2628 break;
2629
2630 charset = CHAR_CHARSET (c);
2631 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2632 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2633 {
2634 found++;
2635 r[reg] = charset;
2636 }
2637 }
2638
2639 label_end_of_loop:
2640 if (found)
2641 {
2642 for (reg = 0; reg < 4; reg++)
2643 if (r[reg] >= 0
2644 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2645 ENCODE_DESIGNATION (r[reg], reg, coding);
2646 }
2647
2648 return dst;
2649 }
2650
2651 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode SRC_BYTES bytes at SOURCE into DESTINATION following the
   ISO2022 variant selected by CODING->flags.  DST_BYTES is the size
   of DESTINATION; when it is zero the output limit is derived from
   the current read position SRC instead (NOTE(review): presumably an
   in-place conversion where the output trails the input -- confirm
   against the callers).

   On return CODING->consumed, CODING->consumed_char, CODING->produced
   and CODING->produced_char are updated.  The only CODING->result
   value stored directly by this function is
   CODING_FINISH_INSUFFICIENT_DST.  CODING->errors is reset to zero
   and then counts the single-byte characters that fail ASCII_BYTE_P
   and are copied through unchanged.  */
2652
2653 static void
2654 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2655 struct coding_system *coding;
2656 const unsigned char *source;
2657 unsigned char *destination;
2658 int src_bytes, dst_bytes;
2659 {
2660 const unsigned char *src = source;
2661 const unsigned char *src_end = source + src_bytes;
2662 unsigned char *dst = destination;
2663 unsigned char *dst_end = destination + dst_bytes;
2664 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2665 from DST_END to assure overflow checking is necessary only at the
2666 head of loop. */
2667 unsigned char *adjusted_dst_end = dst_end - 19;
2668 /* SRC_BASE remembers the start position in source in each loop.
2669 The loop will be exited when there's not enough source text to
2670 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2671 there's not enough destination area to produce encoded codes
2672 (within macro EMIT_BYTES). */
2673 const unsigned char *src_base;
2674 int c;
2675 Lisp_Object translation_table;
2676 Lisp_Object safe_chars;
2677
/* CODING_FLAG_ISO_SAFE turns on CODING_MODE_INHIBIT_UNENCODABLE_CHAR,
   which makes the loop below emit replacement characters (via
   ENCODE_UNSAFE_CHARACTER) for anything CODING_SAFE_CHAR_P rejects.  */
2678 if (coding->flags & CODING_FLAG_ISO_SAFE)
2679 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2680
2681 safe_chars = coding_safe_chars (coding->symbol);
2682
/* Pick the translation table: none when character translation is
   disabled, else the one of CODING, else the standard one.  It is
   not referenced by name below; presumably ONE_MORE_CHAR applies it
   -- confirm against the macro definition.  */
2683 if (NILP (Venable_character_translation))
2684 translation_table = Qnil;
2685 else
2686 {
2687 translation_table = coding->translation_table_for_encode;
2688 if (NILP (translation_table))
2689 translation_table = Vstandard_translation_table_for_encode;
2690 }
2691
2692 coding->consumed_char = 0;
2693 coding->errors = 0;
2694 while (1)
2695 {
2696 src_base = src;
2697
/* Give up before DST comes within 19 bytes of its limit.  With a
   zero DST_BYTES the limit is the read position SRC itself.  */
2698 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2699 {
2700 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2701 break;
2702 }
2703
2704 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2705 && CODING_SPEC_ISO_BOL (coding))
2706 {
2707 /* We have to produce designation sequences if any now. */
2708 dst = encode_designation_at_bol (coding, translation_table,
2709 src, src_end, dst);
2710 CODING_SPEC_ISO_BOL (coding) = 0;
2711 }
2712
2713 /* Check composition start and end. */
2714 if (coding->composing != COMPOSITION_DISABLED
2715 && coding->cmp_data_start < coding->cmp_data->used)
2716 {
2717 struct composition_data *cmp_data = coding->cmp_data;
2718 int *data = cmp_data->data + coding->cmp_data_start;
2719 int this_pos = cmp_data->char_offset + coding->consumed_char;
2720
2721 if (coding->composing == COMPOSITION_RELATIVE)
2722 {
2723 if (this_pos == data[2])
2724 {
2725 ENCODE_COMPOSITION_END (coding, data);
/* ENCODE_COMPOSITION_END may have advanced CODING->cmp_data_start
   or switched CODING->cmp_data, so reload the local views.  */
2726 cmp_data = coding->cmp_data;
2727 data = cmp_data->data + coding->cmp_data_start;
2728 }
2729 }
2730 else if (COMPOSING_P (coding))
2731 {
2732 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2733 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2734 /* We have consumed components of the composition.
2735 What follows in SRC is the composition's base
2736 text. */
2737 ENCODE_COMPOSITION_FAKE_START (coding);
2738 else
2739 {
/* This C shadows the function-level C.  It is read from the
   composition data, not from SRC: an encoded rule when
   composition_rule_follows is set, a component character otherwise.  */
2740 int c = cmp_data->data[coding->cmp_data_index++];
2741 if (coding->composition_rule_follows)
2742 {
2743 ENCODE_COMPOSITION_RULE (c);
2744 coding->composition_rule_follows = 0;
2745 }
2746 else
2747 {
2748 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2749 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2750 ENCODE_UNSAFE_CHARACTER (c);
2751 else
2752 ENCODE_ISO_CHARACTER (c);
2753 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2754 coding->composition_rule_follows = 1;
2755 }
/* Restart the loop without reading from SRC and without counting a
   consumed character for this component.  */
2756 continue;
2757 }
2758 }
2759 if (!COMPOSING_P (coding))
2760 {
2761 if (this_pos == data[1])
2762 {
2763 ENCODE_COMPOSITION_START (coding, data);
2764 continue;
2765 }
2766 }
2767 }
2768
2769 ONE_MORE_CHAR (c);
2770
2771 /* Now encode the character C. */
2772 if (c < 0x20 || c == 0x7F)
2773 {
2774 if (c == '\r')
2775 {
2776 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2777 {
2778 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2779 ENCODE_RESET_PLANE_AND_REGISTER;
2780 *dst++ = c;
/* NOTE(review): this `continue' skips the consumed_char increment
   at the bottom of the loop -- confirm that is intended.  */
2781 continue;
2782 }
2783 /* fall down to treat '\r' as '\n' ... */
2784 c = '\n';
2785 }
2786 if (c == '\n')
2787 {
2788 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2789 ENCODE_RESET_PLANE_AND_REGISTER;
2790 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2791 bcopy (coding->spec.iso2022.initial_designation,
2792 coding->spec.iso2022.current_designation,
2793 sizeof coding->spec.iso2022.initial_designation);
/* Write the end-of-line in the form selected by CODING->eol_type:
   LF (also when undecided), CR LF, or CR.  */
2794 if (coding->eol_type == CODING_EOL_LF
2795 || coding->eol_type == CODING_EOL_UNDECIDED)
2796 *dst++ = ISO_CODE_LF;
2797 else if (coding->eol_type == CODING_EOL_CRLF)
2798 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2799 else
2800 *dst++ = ISO_CODE_CR;
2801 CODING_SPEC_ISO_BOL (coding) = 1;
2802 }
2803 else
2804 {
2805 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2806 ENCODE_RESET_PLANE_AND_REGISTER;
2807 *dst++ = c;
2808 }
2809 }
2810 else if (ASCII_BYTE_P (c))
2811 ENCODE_ISO_CHARACTER (c);
2812 else if (SINGLE_BYTE_CHAR_P (c))
2813 {
/* A single-byte character that failed ASCII_BYTE_P: copy it through
   as is and count it as an error.  */
2814 *dst++ = c;
2815 coding->errors++;
2816 }
2817 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2818 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2819 ENCODE_UNSAFE_CHARACTER (c);
2820 else
2821 ENCODE_ISO_CHARACTER (c);
2822
2823 coding->consumed_char++;
2824 }
2825
/* SRC_BASE, not SRC, is reported as consumed, so an iteration that
   was abandoned part-way is not counted.  One produced byte is
   counted as one produced character.  */
2826 label_end_of_loop:
2827 coding->consumed = src_base - source;
2828 coding->produced = coding->produced_char = dst - destination;
2829 }
2830
2831 \f
2832 /*** 4. SJIS and BIG5 handlers ***/
2833
2834 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2835 quite widely. So, for the moment, Emacs supports them in the bare
2836 C code. But, in the future, they may be supported only by CCL. */
2837
2838 /* SJIS is a coding system encoding three character sets: ASCII, right
2839 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2840 as is. A character of charset katakana-jisx0201 is encoded by
2841 "position-code + 0x80". A character of charset japanese-jisx0208
2842 is encoded in 2-byte but two position-codes are divided and shifted
2843 so that it fits in the range below.
2844
2845 --- CODE RANGE of SJIS ---
2846 (character set) (range)
2847 ASCII 0x00 .. 0x7F
2848 KATAKANA-JISX0201 0xA1 .. 0xDF
2849 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2850 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2851 -------------------------------
2852
2853 */
2854
2855 /* BIG5 is a coding system encoding two character sets: ASCII and
2856 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2857 character set and is encoded in two bytes.
2858
2859 --- CODE RANGE of BIG5 ---
2860 (character set) (range)
2861 ASCII 0x00 .. 0x7F
2862 Big5 (1st byte) 0xA1 .. 0xFE
2863 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2864 --------------------------
2865
2866 Since the number of characters in Big5 is larger than maximum
2867 characters in Emacs' charset (96x96), it can't be handled as one
2868 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2869 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2870 contains frequently used characters and the latter contains less
2871 frequently used characters. */
2872
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1, and C2;
   ENCODE_BIG5 reads CHARSET, C1, and C2 and stores into B1 and B2.
   The arguments stored into must be lvalues.  Every argument is
   parenthesized in the expansions, so expressions such as
   `byte & 0xFF' may be passed.  Arguments are still evaluated more
   than once, so they must not have side effects.

   A 2nd byte below 0x7F is rebased by 0x40 and any other by 0x62
   (= 0xA1 - 0x3F), which packs the two 2nd-byte ranges into one
   contiguous run of BIG5_SAME_ROW codes per 1st byte.  1st bytes
   below 0xC9 belong to `charset_big5_1', the rest to
   `charset_big5_2'.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int temp							\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
	 + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp							\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
2905
2906 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2907 Check if a text is encoded in SJIS. If it is, return
2908 CODING_CATEGORY_MASK_SJIS, else return 0. */
2909
2910 static int
2911 detect_coding_sjis (src, src_end, multibytep)
2912 unsigned char *src, *src_end;
2913 int multibytep;
2914 {
2915 int c;
2916 /* Dummy for ONE_MORE_BYTE. */
2917 struct coding_system dummy_coding;
2918 struct coding_system *coding = &dummy_coding;
2919
2920 while (1)
2921 {
2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2923 if (c < 0x80)
2924 continue;
2925 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2926 return 0;
2927 if (c <= 0x9F || c >= 0xE0)
2928 {
2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2930 if (c < 0x40 || c == 0x7F || c > 0xFC)
2931 return 0;
2932 }
2933 }
2934 label_end_of_loop:
2935 return CODING_CATEGORY_MASK_SJIS;
2936 }
2937
2938 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2939 Check if a text is encoded in BIG5. If it is, return
2940 CODING_CATEGORY_MASK_BIG5, else return 0. */
2941
2942 static int
2943 detect_coding_big5 (src, src_end, multibytep)
2944 unsigned char *src, *src_end;
2945 int multibytep;
2946 {
2947 int c;
2948 /* Dummy for ONE_MORE_BYTE. */
2949 struct coding_system dummy_coding;
2950 struct coding_system *coding = &dummy_coding;
2951
2952 while (1)
2953 {
2954 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2955 if (c < 0x80)
2956 continue;
2957 if (c < 0xA1 || c > 0xFE)
2958 return 0;
2959 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2960 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2961 return 0;
2962 }
2963 label_end_of_loop:
2964 return CODING_CATEGORY_MASK_BIG5;
2965 }
2966
2967 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2968 Check if a text is encoded in UTF-8. If it is, return
2969 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2970
/* Nonzero if the bits of C selected by MASK are equal to PREFIX.  */
#define UTF_8_PREFIX_P(c, mask, prefix) (((c) & (mask)) == (prefix))

/* Classify one octet of a UTF-8 sequence: a single-octet character,
   a trailing octet (10xxxxxx), or the leading octet of a sequence of
   2 through 6 octets.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_PREFIX_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_PREFIX_P (c, 0xFE, 0xFC)
2978
2979 static int
2980 detect_coding_utf_8 (src, src_end, multibytep)
2981 unsigned char *src, *src_end;
2982 int multibytep;
2983 {
2984 unsigned char c;
2985 int seq_maybe_bytes;
2986 /* Dummy for ONE_MORE_BYTE. */
2987 struct coding_system dummy_coding;
2988 struct coding_system *coding = &dummy_coding;
2989
2990 while (1)
2991 {
2992 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2993 if (UTF_8_1_OCTET_P (c))
2994 continue;
2995 else if (UTF_8_2_OCTET_LEADING_P (c))
2996 seq_maybe_bytes = 1;
2997 else if (UTF_8_3_OCTET_LEADING_P (c))
2998 seq_maybe_bytes = 2;
2999 else if (UTF_8_4_OCTET_LEADING_P (c))
3000 seq_maybe_bytes = 3;
3001 else if (UTF_8_5_OCTET_LEADING_P (c))
3002 seq_maybe_bytes = 4;
3003 else if (UTF_8_6_OCTET_LEADING_P (c))
3004 seq_maybe_bytes = 5;
3005 else
3006 return 0;
3007
3008 do
3009 {
3010 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3011 if (!UTF_8_EXTRA_OCTET_P (c))
3012 return 0;
3013 seq_maybe_bytes--;
3014 }
3015 while (seq_maybe_bytes > 0);
3016 }
3017
3018 label_end_of_loop:
3019 return CODING_CATEGORY_MASK_UTF_8;
3020 }
3021
3022 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3023 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
3024 Little Endian (otherwise). If it is, return
3025 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
3026 else return 0. */
3027
/* Nonzero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits select the surrogate block, so the mask must be
   0xFC00; a mask of 0xD800 would also accept values such as 0xDC00
   or 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3037
3038 static int
3039 detect_coding_utf_16 (src, src_end, multibytep)
3040 unsigned char *src, *src_end;
3041 int multibytep;
3042 {
3043 unsigned char c1, c2;
3044 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3045 struct coding_system dummy_coding;
3046 struct coding_system *coding = &dummy_coding;
3047
3048 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3049 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3050
3051 if ((c1 == 0xFF) && (c2 == 0xFE))
3052 return CODING_CATEGORY_MASK_UTF_16_LE;
3053 else if ((c1 == 0xFE) && (c2 == 0xFF))
3054 return CODING_CATEGORY_MASK_UTF_16_BE;
3055
3056 label_end_of_loop:
3057 return 0;
3058 }
3059
3060 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3061 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3062
3063 static void
3064 decode_coding_sjis_big5 (coding, source, destination,
3065 src_bytes, dst_bytes, sjis_p)
3066 struct coding_system *coding;
3067 const unsigned char *source;
3068 unsigned char *destination;
3069 int src_bytes, dst_bytes;
3070 int sjis_p;
3071 {
3072 const unsigned char *src = source;
3073 const unsigned char *src_end = source + src_bytes;
3074 unsigned char *dst = destination;
3075 unsigned char *dst_end = destination + dst_bytes;
3076 /* SRC_BASE remembers the start position in source in each loop.
3077 The loop will be exited when there's not enough source code
3078 (within macro ONE_MORE_BYTE), or when there's not enough
3079 destination area to produce a character (within macro
3080 EMIT_CHAR). */
3081 const unsigned char *src_base;
3082 Lisp_Object translation_table;
3083
3084 if (NILP (Venable_character_translation))
3085 translation_table = Qnil;
3086 else
3087 {
3088 translation_table = coding->translation_table_for_decode;
3089 if (NILP (translation_table))
3090 translation_table = Vstandard_translation_table_for_decode;
3091 }
3092
3093 coding->produced_char = 0;
3094 while (1)
3095 {
3096 int c, charset, c1, c2 = 0;
3097
3098 src_base = src;
3099 ONE_MORE_BYTE (c1);
3100
3101 if (c1 < 0x80)
3102 {
3103 charset = CHARSET_ASCII;
3104 if (c1 < 0x20)
3105 {
3106 if (c1 == '\r')
3107 {
3108 if (coding->eol_type == CODING_EOL_CRLF)
3109 {
3110 ONE_MORE_BYTE (c2);
3111 if (c2 == '\n')
3112 c1 = c2;
3113 else
3114 /* To process C2 again, SRC is subtracted by 1. */
3115 src--;
3116 }
3117 else if (coding->eol_type == CODING_EOL_CR)
3118 c1 = '\n';
3119 }
3120 else if (c1 == '\n'
3121 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3122 && (coding->eol_type == CODING_EOL_CR
3123 || coding->eol_type == CODING_EOL_CRLF))
3124 {
3125 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3126 goto label_end_of_loop;
3127 }
3128 }
3129 }
3130 else
3131 {
3132 if (sjis_p)
3133 {
3134 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3135 goto label_invalid_code;
3136 if (c1 <= 0x9F || c1 >= 0xE0)
3137 {
3138 /* SJIS -> JISX0208 */
3139 ONE_MORE_BYTE (c2);
3140 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3141 goto label_invalid_code;
3142 DECODE_SJIS (c1, c2, c1, c2);
3143 charset = charset_jisx0208;
3144 }
3145 else
3146 /* SJIS -> JISX0201-Kana */
3147 charset = charset_katakana_jisx0201;
3148 }
3149 else
3150 {
3151 /* BIG5 -> Big5 */
3152 if (c1 < 0xA0 || c1 > 0xFE)
3153 goto label_invalid_code;
3154 ONE_MORE_BYTE (c2);
3155 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3156 goto label_invalid_code;
3157 DECODE_BIG5 (c1, c2, charset, c1, c2);
3158 }
3159 }
3160
3161 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3162 EMIT_CHAR (c);
3163 continue;
3164
3165 label_invalid_code:
3166 coding->errors++;
3167 src = src_base;
3168 c = *src++;
3169 EMIT_CHAR (c);
3170 }
3171
3172 label_end_of_loop:
3173 coding->consumed = coding->consumed_char = src_base - source;
3174 coding->produced = dst - destination;
3175 return;
3176 }
3177
3178 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3179 This function can encode charsets `ascii', `katakana-jisx0201',
3180 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3181 are sure that all these charsets are registered as official charset
3182 (i.e. do not have extended leading-codes). Characters of other
3183 charsets are produced without any encoding. If SJIS_P is 1, encode
3184 SJIS text, else encode BIG5 text. */
3185
3186 static void
3187 encode_coding_sjis_big5 (coding, source, destination,
3188 src_bytes, dst_bytes, sjis_p)
3189 struct coding_system *coding;
3190 unsigned char *source, *destination;
3191 int src_bytes, dst_bytes;
3192 int sjis_p;
3193 {
3194 unsigned char *src = source;
3195 unsigned char *src_end = source + src_bytes;
3196 unsigned char *dst = destination;
3197 unsigned char *dst_end = destination + dst_bytes;
3198 /* SRC_BASE remembers the start position in source in each loop.
3199 The loop will be exited when there's not enough source text to
3200 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3201 there's not enough destination area to produce encoded codes
3202 (within macro EMIT_BYTES). */
3203 unsigned char *src_base;
3204 Lisp_Object translation_table;
3205
3206 if (NILP (Venable_character_translation))
3207 translation_table = Qnil;
3208 else
3209 {
3210 translation_table = coding->translation_table_for_encode;
3211 if (NILP (translation_table))
3212 translation_table = Vstandard_translation_table_for_encode;
3213 }
3214
3215 while (1)
3216 {
3217 int c, charset, c1, c2;
3218
3219 src_base = src;
3220 ONE_MORE_CHAR (c);
3221
3222 /* Now encode the character C. */
3223 if (SINGLE_BYTE_CHAR_P (c))
3224 {
3225 switch (c)
3226 {
3227 case '\r':
3228 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3229 {
3230 EMIT_ONE_BYTE (c);
3231 break;
3232 }
3233 c = '\n';
3234 case '\n':
3235 if (coding->eol_type == CODING_EOL_CRLF)
3236 {
3237 EMIT_TWO_BYTES ('\r', c);
3238 break;
3239 }
3240 else if (coding->eol_type == CODING_EOL_CR)
3241 c = '\r';
3242 default:
3243 EMIT_ONE_BYTE (c);
3244 }
3245 }
3246 else
3247 {
3248 SPLIT_CHAR (c, charset, c1, c2);
3249 if (sjis_p)
3250 {
3251 if (charset == charset_jisx0208
3252 || charset == charset_jisx0208_1978)
3253 {
3254 ENCODE_SJIS (c1, c2, c1, c2);
3255 EMIT_TWO_BYTES (c1, c2);
3256 }
3257 else if (charset == charset_katakana_jisx0201)
3258 EMIT_ONE_BYTE (c1 | 0x80);
3259 else if (charset == charset_latin_jisx0201)
3260 EMIT_ONE_BYTE (c1);
3261 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3262 {
3263 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3264 if (CHARSET_WIDTH (charset) > 1)
3265 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3266 }
3267 else
3268 /* There's no way other than producing the internal
3269 codes as is. */
3270 EMIT_BYTES (src_base, src);
3271 }
3272 else
3273 {
3274 if (charset == charset_big5_1 || charset == charset_big5_2)
3275 {
3276 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3277 EMIT_TWO_BYTES (c1, c2);
3278 }
3279 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3280 {
3281 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3282 if (CHARSET_WIDTH (charset) > 1)
3283 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3284 }
3285 else
3286 /* There's no way other than producing the internal
3287 codes as is. */
3288 EMIT_BYTES (src_base, src);
3289 }
3290 }
3291 coding->consumed_char++;
3292 }
3293
3294 label_end_of_loop:
3295 coding->consumed = src_base - source;
3296 coding->produced = coding->produced_char = dst - destination;
3297 }
3298
3299 \f
3300 /*** 5. CCL handlers ***/
3301
3302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3303 Check if a text is encoded in a coding system of which
3304 encoder/decoder are written in CCL program. If it is, return
3305 CODING_CATEGORY_MASK_CCL, else return 0. */
3306
3307 static int
3308 detect_coding_ccl (src, src_end, multibytep)
3309 unsigned char *src, *src_end;
3310 int multibytep;
3311 {
3312 unsigned char *valid;
3313 int c;
3314 /* Dummy for ONE_MORE_BYTE. */
3315 struct coding_system dummy_coding;
3316 struct coding_system *coding = &dummy_coding;
3317
3318 /* No coding system is assigned to coding-category-ccl. */
3319 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3320 return 0;
3321
3322 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3323 while (1)
3324 {
3325 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3326 if (! valid[c])
3327 return 0;
3328 }
3329 label_end_of_loop:
3330 return CODING_CATEGORY_MASK_CCL;
3331 }
3332
3333 \f
3334 /*** 6. End-of-line handlers ***/
3335
3336 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3337
3338 static void
3339 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3340 struct coding_system *coding;
3341 const unsigned char *source;
3342 unsigned char *destination;
3343 int src_bytes, dst_bytes;
3344 {
3345 const unsigned char *src = source;
3346 unsigned char *dst = destination;
3347 const unsigned char *src_end = src + src_bytes;
3348 unsigned char *dst_end = dst + dst_bytes;
3349 Lisp_Object translation_table;
3350 /* SRC_BASE remembers the start position in source in each loop.
3351 The loop will be exited when there's not enough source code
3352 (within macro ONE_MORE_BYTE), or when there's not enough
3353 destination area to produce a character (within macro
3354 EMIT_CHAR). */
3355 const unsigned char *src_base;
3356 int c;
3357
3358 translation_table = Qnil;
3359 switch (coding->eol_type)
3360 {
3361 case CODING_EOL_CRLF:
3362 while (1)
3363 {
3364 src_base = src;
3365 ONE_MORE_BYTE (c);
3366 if (c == '\r')
3367 {
3368 ONE_MORE_BYTE (c);
3369 if (c != '\n')
3370 {
3371 src--;
3372 c = '\r';
3373 }
3374 }
3375 else if (c == '\n'
3376 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3377 {
3378 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3379 goto label_end_of_loop;
3380 }
3381 EMIT_CHAR (c);
3382 }
3383 break;
3384
3385 case CODING_EOL_CR:
3386 while (1)
3387 {
3388 src_base = src;
3389 ONE_MORE_BYTE (c);
3390 if (c == '\n')
3391 {
3392 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3393 {
3394 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3395 goto label_end_of_loop;
3396 }
3397 }
3398 else if (c == '\r')
3399 c = '\n';
3400 EMIT_CHAR (c);
3401 }
3402 break;
3403
3404 default: /* no need for EOL handling */
3405 while (1)
3406 {
3407 src_base = src;
3408 ONE_MORE_BYTE (c);
3409 EMIT_CHAR (c);
3410 }
3411 }
3412
3413 label_end_of_loop:
3414 coding->consumed = coding->consumed_char = src_base - source;
3415 coding->produced = dst - destination;
3416 return;
3417 }
3418
3419 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3420 format of end-of-line according to `coding->eol_type'. It also
3421 convert multibyte form 8-bit characters to unibyte if
3422 CODING->src_multibyte is nonzero. If `coding->mode &
3423 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3424 also means end-of-line. */
3425
3426 static void
3427 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3428 struct coding_system *coding;
3429 const unsigned char *source;
3430 unsigned char *destination;
3431 int src_bytes, dst_bytes;
3432 {
3433 const unsigned char *src = source;
3434 unsigned char *dst = destination;
3435 const unsigned char *src_end = src + src_bytes;
3436 unsigned char *dst_end = dst + dst_bytes;
3437 Lisp_Object translation_table;
3438 /* SRC_BASE remembers the start position in source in each loop.
3439 The loop will be exited when there's not enough source text to
3440 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3441 there's not enough destination area to produce encoded codes
3442 (within macro EMIT_BYTES). */
3443 const unsigned char *src_base;
3444 unsigned char *tmp;
3445 int c;
3446 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3447
3448 translation_table = Qnil;
3449 if (coding->src_multibyte
3450 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3451 {
3452 src_end--;
3453 src_bytes--;
3454 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3455 }
3456
3457 if (coding->eol_type == CODING_EOL_CRLF)
3458 {
3459 while (src < src_end)
3460 {
3461 src_base = src;
3462 c = *src++;
3463 if (c >= 0x20)
3464 EMIT_ONE_BYTE (c);
3465 else if (c == '\n' || (c == '\r' && selective_display))
3466 EMIT_TWO_BYTES ('\r', '\n');
3467 else
3468 EMIT_ONE_BYTE (c);
3469 }
3470 src_base = src;
3471 label_end_of_loop:
3472 ;
3473 }
3474 else
3475 {
3476 if (!dst_bytes || src_bytes <= dst_bytes)
3477 {
3478 safe_bcopy (src, dst, src_bytes);
3479 src_base = src_end;
3480 dst += src_bytes;
3481 }
3482 else
3483 {
3484 if (coding->src_multibyte
3485 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3486 dst_bytes--;
3487 safe_bcopy (src, dst, dst_bytes);
3488 src_base = src + dst_bytes;
3489 dst = destination + dst_bytes;
3490 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3491 }
3492 if (coding->eol_type == CODING_EOL_CR)
3493 {
3494 for (tmp = destination; tmp < dst; tmp++)
3495 if (*tmp == '\n') *tmp = '\r';
3496 }
3497 else if (selective_display)
3498 {
3499 for (tmp = destination; tmp < dst; tmp++)
3500 if (*tmp == '\r') *tmp = '\n';
3501 }
3502 }
3503 if (coding->src_multibyte)
3504 dst = destination + str_as_unibyte (destination, dst - destination);
3505
3506 coding->consumed = src_base - source;
3507 coding->produced = dst - destination;
3508 coding->produced_char = coding->produced;
3509 }
3510
3511 \f
3512 /*** 7. C library functions ***/
3513
3514 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3515 has a property `coding-system'. The value of this property is a
3516 vector of length 5 (called the coding-vector). Among elements of
3517 this vector, the first (element[0]) and the fifth (element[4])
3518 carry important information for decoding/encoding. Before
3519 decoding/encoding, this information should be set in fields of a
3520 structure of type `coding_system'.
3521
3522 The value of the property `coding-system' can be a symbol of another
3523 subsidiary coding-system. In that case, Emacs gets coding-vector
3524 from that symbol.
3525
3526 `element[0]' contains information to be set in `coding->type'. The
3527 value and its meaning is as follows:
3528
3529 0 -- coding_type_emacs_mule
3530 1 -- coding_type_sjis
3531 2 -- coding_type_iso2022
3532 3 -- coding_type_big5
3533 4 -- coding_type_ccl encoder/decoder written in CCL
3534 nil -- coding_type_no_conversion
3535 t -- coding_type_undecided (automatic conversion on decoding,
3536 no-conversion on encoding)
3537
3538 `element[4]' contains information to be set in `coding->flags' and
3539 `coding->spec'. The meaning varies by `coding->type'.
3540
3541 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3542 of length 32 (of which the first 17 sub-elements are used now).
3543 Meanings of these sub-elements are:
3544
3545 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3546 If the value is an integer of valid charset, the charset is
3547 assumed to be designated to graphic register N initially.
3548
3549 If the value is minus, it is a minus value of charset which
3550 reserves graphic register N, which means that the charset is
3551 not designated initially but should be designated to graphic
3552 register N just before encoding a character in that charset.
3553
3554 If the value is nil, graphic register N is never used on
3555 encoding.
3556
3557 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3558 Each value takes t or nil. See the section ISO2022 of
3559 `coding.h' for more information.
3560
3561 If `coding->type' is `coding_type_big5', element[4] is t to denote
3562 BIG5-ETen or nil to denote BIG5-HKU.
3563
3564 If `coding->type' takes the other value, element[4] is ignored.
3565
3566 Emacs Lisp's coding systems also carry information about format of
3567 end-of-line in a value of property `eol-type'. If the value is
3568 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3569 means CODING_EOL_CR. If it is not integer, it should be a vector
3570 of subsidiary coding systems of which property `eol-type' has one
3571 of the above values.
3572
3573 */
3574
3575 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3576 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3577 is setup so that no conversion is necessary and return -1, else
3578 return 0. */
3579
3580 int
3581 setup_coding_system (coding_system, coding)
3582 Lisp_Object coding_system;
3583 struct coding_system *coding;
3584 {
3585 Lisp_Object coding_spec, coding_type, eol_type, plist;
3586 Lisp_Object val;
3587
3588 /* At first, zero clear all members. */
3589 bzero (coding, sizeof (struct coding_system));
3590
3591 /* Initialize some fields required for all kinds of coding systems. */
3592 coding->symbol = coding_system;
3593 coding->heading_ascii = -1;
3594 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3595 coding->composing = COMPOSITION_DISABLED;
3596 coding->cmp_data = NULL;
3597
3598 if (NILP (coding_system))
3599 goto label_invalid_coding_system;
3600
3601 coding_spec = Fget (coding_system, Qcoding_system);
3602
3603 if (!VECTORP (coding_spec)
3604 || XVECTOR (coding_spec)->size != 5
3605 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3606 goto label_invalid_coding_system;
3607
3608 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3609 if (VECTORP (eol_type))
3610 {
3611 coding->eol_type = CODING_EOL_UNDECIDED;
3612 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3613 if (system_eol_type != CODING_EOL_LF)
3614 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3615 }
3616 else if (XFASTINT (eol_type) == 1)
3617 {
3618 coding->eol_type = CODING_EOL_CRLF;
3619 coding->common_flags
3620 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3621 }
3622 else if (XFASTINT (eol_type) == 2)
3623 {
3624 coding->eol_type = CODING_EOL_CR;
3625 coding->common_flags
3626 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3627 }
3628 else
3629 {
3630 coding->common_flags = 0;
3631 coding->eol_type = CODING_EOL_LF;
3632 }
3633
3634 coding_type = XVECTOR (coding_spec)->contents[0];
3635 /* Try short cut. */
3636 if (SYMBOLP (coding_type))
3637 {
3638 if (EQ (coding_type, Qt))
3639 {
3640 coding->type = coding_type_undecided;
3641 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3642 }
3643 else
3644 coding->type = coding_type_no_conversion;
3645 /* Initialize this member. Any thing other than
3646 CODING_CATEGORY_IDX_UTF_16_BE and
3647 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3648 special treatment in detect_eol. */
3649 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3650
3651 return 0;
3652 }
3653
3654 /* Get values of coding system properties:
3655 `post-read-conversion', `pre-write-conversion',
3656 `translation-table-for-decode', `translation-table-for-encode'. */
3657 plist = XVECTOR (coding_spec)->contents[3];
3658 /* Pre & post conversion functions should be disabled if
3659 inhibit_eol_conversion is nonzero. This is the case that a code
3660 conversion function is called while those functions are running. */
3661 if (! inhibit_pre_post_conversion)
3662 {
3663 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3664 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3665 }
3666 val = Fplist_get (plist, Qtranslation_table_for_decode);
3667 if (SYMBOLP (val))
3668 val = Fget (val, Qtranslation_table_for_decode);
3669 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3670 val = Fplist_get (plist, Qtranslation_table_for_encode);
3671 if (SYMBOLP (val))
3672 val = Fget (val, Qtranslation_table_for_encode);
3673 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3674 val = Fplist_get (plist, Qcoding_category);
3675 if (!NILP (val))
3676 {
3677 val = Fget (val, Qcoding_category_index);
3678 if (INTEGERP (val))
3679 coding->category_idx = XINT (val);
3680 else
3681 goto label_invalid_coding_system;
3682 }
3683 else
3684 goto label_invalid_coding_system;
3685
3686 /* If the coding system has non-nil `composition' property, enable
3687 composition handling. */
3688 val = Fplist_get (plist, Qcomposition);
3689 if (!NILP (val))
3690 coding->composing = COMPOSITION_NO;
3691
3692 /* If the coding system is ascii-incompatible, record it in
3693 common_flags. */
3694 val = Fplist_get (plist, Qascii_incompatible);
3695 if (! NILP (val))
3696 coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3697
3698 switch (XFASTINT (coding_type))
3699 {
3700 case 0:
3701 coding->type = coding_type_emacs_mule;
3702 coding->common_flags
3703 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3704 if (!NILP (coding->post_read_conversion))
3705 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3706 if (!NILP (coding->pre_write_conversion))
3707 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3708 break;
3709
3710 case 1:
3711 coding->type = coding_type_sjis;
3712 coding->common_flags
3713 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3714 break;
3715
3716 case 2:
3717 coding->type = coding_type_iso2022;
3718 coding->common_flags
3719 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3720 {
3721 Lisp_Object val, temp;
3722 Lisp_Object *flags;
3723 int i, charset, reg_bits = 0;
3724
3725 val = XVECTOR (coding_spec)->contents[4];
3726
3727 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3728 goto label_invalid_coding_system;
3729
3730 flags = XVECTOR (val)->contents;
3731 coding->flags
3732 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3733 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3734 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3735 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3736 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3737 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3738 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3739 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3740 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3741 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3742 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3743 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3744 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3745 );
3746
3747 /* Invoke graphic register 0 to plane 0. */
3748 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3749 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3750 CODING_SPEC_ISO_INVOCATION (coding, 1)
3751 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3752 /* Not single shifting at first. */
3753 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3754 /* Beginning of buffer should also be regarded as bol. */
3755 CODING_SPEC_ISO_BOL (coding) = 1;
3756
3757 for (charset = 0; charset <= MAX_CHARSET; charset++)
3758 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3759 val = Vcharset_revision_alist;
3760 while (CONSP (val))
3761 {
3762 charset = get_charset_id (Fcar_safe (XCAR (val)));
3763 if (charset >= 0
3764 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3765 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3766 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3767 val = XCDR (val);
3768 }
3769
3770 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3771 FLAGS[REG] can be one of below:
3772 integer CHARSET: CHARSET occupies register I,
3773 t: designate nothing to REG initially, but can be used
3774 by any charsets,
3775 list of integer, nil, or t: designate the first
3776 element (if integer) to REG initially, the remaining
3777 elements (if integer) is designated to REG on request,
3778 if an element is t, REG can be used by any charsets,
3779 nil: REG is never used. */
3780 for (charset = 0; charset <= MAX_CHARSET; charset++)
3781 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3782 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3783 for (i = 0; i < 4; i++)
3784 {
3785 if ((INTEGERP (flags[i])
3786 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3787 || (charset = get_charset_id (flags[i])) >= 0)
3788 {
3789 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3790 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3791 }
3792 else if (EQ (flags[i], Qt))
3793 {
3794 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3795 reg_bits |= 1 << i;
3796 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3797 }
3798 else if (CONSP (flags[i]))
3799 {
3800 Lisp_Object tail;
3801 tail = flags[i];
3802
3803 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3804 if ((INTEGERP (XCAR (tail))
3805 && (charset = XINT (XCAR (tail)),
3806 CHARSET_VALID_P (charset)))
3807 || (charset = get_charset_id (XCAR (tail))) >= 0)
3808 {
3809 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3810 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3811 }
3812 else
3813 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3814 tail = XCDR (tail);
3815 while (CONSP (tail))
3816 {
3817 if ((INTEGERP (XCAR (tail))
3818 && (charset = XINT (XCAR (tail)),
3819 CHARSET_VALID_P (charset)))
3820 || (charset = get_charset_id (XCAR (tail))) >= 0)
3821 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3822 = i;
3823 else if (EQ (XCAR (tail), Qt))
3824 reg_bits |= 1 << i;
3825 tail = XCDR (tail);
3826 }
3827 }
3828 else
3829 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3830
3831 CODING_SPEC_ISO_DESIGNATION (coding, i)
3832 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3833 }
3834
3835 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3836 {
3837 /* REG 1 can be used only by locking shift in 7-bit env. */
3838 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3839 reg_bits &= ~2;
3840 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3841 /* Without any shifting, only REG 0 and 1 can be used. */
3842 reg_bits &= 3;
3843 }
3844
3845 if (reg_bits)
3846 for (charset = 0; charset <= MAX_CHARSET; charset++)
3847 {
3848 if (CHARSET_DEFINED_P (charset)
3849 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3850 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3851 {
3852 /* There exist some default graphic registers to be
3853 used by CHARSET. */
3854
3855 /* We had better avoid designating a charset of
3856 CHARS96 to REG 0 as far as possible. */
3857 if (CHARSET_CHARS (charset) == 96)
3858 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3859 = (reg_bits & 2
3860 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3861 else
3862 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3863 = (reg_bits & 1
3864 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3865 }
3866 }
3867 }
3868 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3869 coding->spec.iso2022.last_invalid_designation_register = -1;
3870 break;
3871
3872 case 3:
3873 coding->type = coding_type_big5;
3874 coding->common_flags
3875 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3876 coding->flags
3877 = (NILP (XVECTOR (coding_spec)->contents[4])
3878 ? CODING_FLAG_BIG5_HKU
3879 : CODING_FLAG_BIG5_ETEN);
3880 break;
3881
3882 case 4:
3883 coding->type = coding_type_ccl;
3884 coding->common_flags
3885 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3886 {
3887 val = XVECTOR (coding_spec)->contents[4];
3888 if (! CONSP (val)
3889 || setup_ccl_program (&(coding->spec.ccl.decoder),
3890 XCAR (val)) < 0
3891 || setup_ccl_program (&(coding->spec.ccl.encoder),
3892 XCDR (val)) < 0)
3893 goto label_invalid_coding_system;
3894
3895 bzero (coding->spec.ccl.valid_codes, 256);
3896 val = Fplist_get (plist, Qvalid_codes);
3897 if (CONSP (val))
3898 {
3899 Lisp_Object this;
3900
3901 for (; CONSP (val); val = XCDR (val))
3902 {
3903 this = XCAR (val);
3904 if (INTEGERP (this)
3905 && XINT (this) >= 0 && XINT (this) < 256)
3906 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3907 else if (CONSP (this)
3908 && INTEGERP (XCAR (this))
3909 && INTEGERP (XCDR (this)))
3910 {
3911 int start = XINT (XCAR (this));
3912 int end = XINT (XCDR (this));
3913
3914 if (start >= 0 && start <= end && end < 256)
3915 while (start <= end)
3916 coding->spec.ccl.valid_codes[start++] = 1;
3917 }
3918 }
3919 }
3920 }
3921 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3922 coding->spec.ccl.cr_carryover = 0;
3923 coding->spec.ccl.eight_bit_carryover[0] = 0;
3924 break;
3925
3926 case 5:
3927 coding->type = coding_type_raw_text;
3928 break;
3929
3930 default:
3931 goto label_invalid_coding_system;
3932 }
3933 return 0;
3934
3935 label_invalid_coding_system:
3936 coding->type = coding_type_no_conversion;
3937 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3938 coding->common_flags = 0;
3939 coding->eol_type = CODING_EOL_UNDECIDED;
3940 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3941 return NILP (coding_system) ? 0 : -1;
3942 }
3943
3944 /* Free memory blocks allocated for storing composition information. */
3945
3946 void
3947 coding_free_composition_data (coding)
3948 struct coding_system *coding;
3949 {
3950 struct composition_data *cmp_data = coding->cmp_data, *next;
3951
3952 if (!cmp_data)
3953 return;
3954 /* Memory blocks are chained. At first, rewind to the first, then,
3955 free blocks one by one. */
3956 while (cmp_data->prev)
3957 cmp_data = cmp_data->prev;
3958 while (cmp_data)
3959 {
3960 next = cmp_data->next;
3961 xfree (cmp_data);
3962 cmp_data = next;
3963 }
3964 coding->cmp_data = NULL;
3965 }
3966
3967 /* Set `char_offset' member of all memory blocks pointed by
3968 coding->cmp_data to POS. */
3969
3970 void
3971 coding_adjust_composition_offset (coding, pos)
3972 struct coding_system *coding;
3973 int pos;
3974 {
3975 struct composition_data *cmp_data;
3976
3977 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3978 cmp_data->char_offset = pos;
3979 }
3980
3981 /* Setup raw-text or one of its subsidiaries in the structure
3982 coding_system CODING according to the already setup value eol_type
3983 in CODING. CODING should be setup for some coding system in
3984 advance. */
3985
3986 void
3987 setup_raw_text_coding_system (coding)
3988 struct coding_system *coding;
3989 {
3990 if (coding->type != coding_type_raw_text)
3991 {
3992 coding->symbol = Qraw_text;
3993 coding->type = coding_type_raw_text;
3994 if (coding->eol_type != CODING_EOL_UNDECIDED)
3995 {
3996 Lisp_Object subsidiaries;
3997 subsidiaries = Fget (Qraw_text, Qeol_type);
3998
3999 if (VECTORP (subsidiaries)
4000 && XVECTOR (subsidiaries)->size == 3)
4001 coding->symbol
4002 = XVECTOR (subsidiaries)->contents[coding->eol_type];
4003 }
4004 setup_coding_system (coding->symbol, coding);
4005 }
4006 return;
4007 }
4008
4009 /* Emacs has a mechanism to automatically detect a coding system if it
4010 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4011 it's impossible to distinguish some coding systems accurately
4012 because they use the same range of codes. So, at first, coding
4013 systems are categorized into the following categories:
4014
4015 o coding-category-emacs-mule
4016
4017 The category for a coding system which has the same code range
4018 as Emacs' internal format. Assigned the coding-system (Lisp
4019 symbol) `emacs-mule' by default.
4020
4021 o coding-category-sjis
4022
4023 The category for a coding system which has the same code range
4024 as SJIS. Assigned the coding-system (Lisp
4025 symbol) `japanese-shift-jis' by default.
4026
4027 o coding-category-iso-7
4028
4029 The category for a coding system which has the same code range
4030 as ISO2022 of 7-bit environment. This doesn't use any locking
4031 shift and single shift functions. This can encode/decode all
4032 charsets. Assigned the coding-system (Lisp symbol)
4033 `iso-2022-7bit' by default.
4034
4035 o coding-category-iso-7-tight
4036
4037 Same as coding-category-iso-7 except that this can
4038 encode/decode only the specified charsets.
4039
4040 o coding-category-iso-8-1
4041
4042 The category for a coding system which has the same code range
4043 as ISO2022 of 8-bit environment and graphic plane 1 used only
4044 for DIMENSION1 charset. This doesn't use any locking shift
4045 and single shift functions. Assigned the coding-system (Lisp
4046 symbol) `iso-latin-1' by default.
4047
4048 o coding-category-iso-8-2
4049
4050 The category for a coding system which has the same code range
4051 as ISO2022 of 8-bit environment and graphic plane 1 used only
4052 for DIMENSION2 charset. This doesn't use any locking shift
4053 and single shift functions. Assigned the coding-system (Lisp
4054 symbol) `japanese-iso-8bit' by default.
4055
4056 o coding-category-iso-7-else
4057
4058 The category for a coding system which has the same code range
4059 as ISO2022 of 7-bit environment but uses locking shift or
4060 single shift functions. Assigned the coding-system (Lisp
4061 symbol) `iso-2022-7bit-lock' by default.
4062
4063 o coding-category-iso-8-else
4064
4065 The category for a coding system which has the same code range
4066 as ISO2022 of 8-bit environment but uses locking shift or
4067 single shift functions. Assigned the coding-system (Lisp
4068 symbol) `iso-2022-8bit-ss2' by default.
4069
4070 o coding-category-big5
4071
4072 The category for a coding system which has the same code range
4073 as BIG5. Assigned the coding-system (Lisp symbol)
4074 `cn-big5' by default.
4075
4076 o coding-category-utf-8
4077
4078 The category for a coding system which has the same code range
4079 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4080 symbol) `utf-8' by default.
4081
4082 o coding-category-utf-16-be
4083
4084 The category for a coding system in which a text has an
4085 Unicode signature (cf. Unicode Standard) in the order of BIG
4086 endian at the head. Assigned the coding-system (Lisp symbol)
4087 `utf-16-be' by default.
4088
4089 o coding-category-utf-16-le
4090
4091 The category for a coding system in which a text has an
4092 Unicode signature (cf. Unicode Standard) in the order of
4093 LITTLE endian at the head. Assigned the coding-system (Lisp
4094 symbol) `utf-16-le' by default.
4095
4096 o coding-category-ccl
4097
4098 The category for a coding system of which encoder/decoder is
4099 written in CCL programs. The default value is nil, i.e., no
4100 coding system is assigned.
4101
4102 o coding-category-binary
4103
4104 The category for a coding system not categorized in any of the
4105 above. Assigned the coding-system (Lisp symbol)
4106 `no-conversion' by default.
4107
4108 Each of them is a Lisp symbol and the value is an actual
4109 `coding-system' (this is also a Lisp symbol) assigned by a user.
4110 What Emacs does actually is to detect a category of coding system.
4111 Then, it uses a `coding-system' assigned to it. If Emacs can't
4112 decide a single possible category, it selects a category of the
4113 highest priority. Priorities of categories are also specified by a
4114 user in a Lisp variable `coding-category-list'.
4115
4116 */
4117
/* Table indexed by byte value.  detect_coding_mask skips a leading
   byte of the text while the entry for that byte is nonzero; it also
   rewrites the entries for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC
   on every call.  */
static int ascii_skip_code[256];
4120
4121 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4122 If it detects possible coding systems, return an integer in which
4123 appropriate flag bits are set. Flag bits are defined by macros
4124 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4125 it should point the table `coding_priorities'. In that case, only
4126 the flag bit for a coding system of the highest priority is set in
4127 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4128 range 0x80..0x9F are in multibyte form.
4129
4130 How many ASCII characters are at the head is returned as *SKIP. */
4131
4132 static int
4133 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4134 unsigned char *source;
4135 int src_bytes, *priorities, *skip;
4136 int multibytep;
4137 {
4138 register unsigned char c;
4139 unsigned char *src = source, *src_end = source + src_bytes;
4140 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4141 int i;
4142
4143 /* At first, skip all ASCII characters and control characters except
4144 for three ISO2022 specific control characters. */
4145 ascii_skip_code[ISO_CODE_SO] = 0;
4146 ascii_skip_code[ISO_CODE_SI] = 0;
4147 ascii_skip_code[ISO_CODE_ESC] = 0;
4148
4149 label_loop_detect_coding:
4150 while (src < src_end && ascii_skip_code[*src]) src++;
4151 *skip = src - source;
4152
4153 if (src >= src_end)
4154 /* We found nothing other than ASCII. There's nothing to do. */
4155 return 0;
4156
4157 c = *src;
4158 /* The text seems to be encoded in some multilingual coding system.
4159 Now, try to find in which coding system the text is encoded. */
4160 if (c < 0x80)
4161 {
4162 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4163 /* C is an ISO2022 specific control code of C0. */
4164 mask = detect_coding_iso2022 (src, src_end, multibytep);
4165 if (mask == 0)
4166 {
4167 /* No valid ISO2022 code follows C. Try again. */
4168 src++;
4169 if (c == ISO_CODE_ESC)
4170 ascii_skip_code[ISO_CODE_ESC] = 1;
4171 else
4172 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4173 goto label_loop_detect_coding;
4174 }
4175 if (priorities)
4176 {
4177 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4178 {
4179 if (mask & priorities[i])
4180 return priorities[i];
4181 }
4182 return CODING_CATEGORY_MASK_RAW_TEXT;
4183 }
4184 }
4185 else
4186 {
4187 int try;
4188
4189 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4190 c = src[1] - 0x20;
4191
4192 if (c < 0xA0)
4193 {
4194 /* C is the first byte of SJIS character code,
4195 or a leading-code of Emacs' internal format (emacs-mule),
4196 or the first byte of UTF-16. */
4197 try = (CODING_CATEGORY_MASK_SJIS
4198 | CODING_CATEGORY_MASK_EMACS_MULE
4199 | CODING_CATEGORY_MASK_UTF_16_BE
4200 | CODING_CATEGORY_MASK_UTF_16_LE);
4201
4202 /* Or, if C is a special latin extra code,
4203 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4204 or is an ISO2022 control-sequence-introducer (CSI),
4205 we should also consider the possibility of ISO2022 codings. */
4206 if ((VECTORP (Vlatin_extra_code_table)
4207 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4208 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4209 || (c == ISO_CODE_CSI
4210 && (src < src_end
4211 && (*src == ']'
4212 || ((*src == '0' || *src == '1' || *src == '2')
4213 && src + 1 < src_end
4214 && src[1] == ']')))))
4215 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4216 | CODING_CATEGORY_MASK_ISO_8BIT);
4217 }
4218 else
4219 /* C is a character of ISO2022 in graphic plane right,
4220 or a SJIS's 1-byte character code (i.e. JISX0201),
4221 or the first byte of BIG5's 2-byte code,
4222 or the first byte of UTF-8/16. */
4223 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4224 | CODING_CATEGORY_MASK_ISO_8BIT
4225 | CODING_CATEGORY_MASK_SJIS
4226 | CODING_CATEGORY_MASK_BIG5
4227 | CODING_CATEGORY_MASK_UTF_8
4228 | CODING_CATEGORY_MASK_UTF_16_BE
4229 | CODING_CATEGORY_MASK_UTF_16_LE);
4230
4231 /* Or, we may have to consider the possibility of CCL. */
4232 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4233 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4234 ->spec.ccl.valid_codes)[c])
4235 try |= CODING_CATEGORY_MASK_CCL;
4236
4237 mask = 0;
4238 utf16_examined_p = iso2022_examined_p = 0;
4239 if (priorities)
4240 {
4241 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4242 {
4243 if (!iso2022_examined_p
4244 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4245 {
4246 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4247 iso2022_examined_p = 1;
4248 }
4249 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4250 mask |= detect_coding_sjis (src, src_end, multibytep);
4251 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4252 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4253 else if (!utf16_examined_p
4254 && (priorities[i] & try &
4255 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4256 {
4257 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4258 utf16_examined_p = 1;
4259 }
4260 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4261 mask |= detect_coding_big5 (src, src_end, multibytep);
4262 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4263 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4264 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4265 mask |= detect_coding_ccl (src, src_end, multibytep);
4266 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4267 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4268 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4269 mask |= CODING_CATEGORY_MASK_BINARY;
4270 if (mask & priorities[i])
4271 return priorities[i];
4272 }
4273 return CODING_CATEGORY_MASK_RAW_TEXT;
4274 }
4275 if (try & CODING_CATEGORY_MASK_ISO)
4276 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4277 if (try & CODING_CATEGORY_MASK_SJIS)
4278 mask |= detect_coding_sjis (src, src_end, multibytep);
4279 if (try & CODING_CATEGORY_MASK_BIG5)
4280 mask |= detect_coding_big5 (src, src_end, multibytep);
4281 if (try & CODING_CATEGORY_MASK_UTF_8)
4282 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4283 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4284 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4285 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4286 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4287 if (try & CODING_CATEGORY_MASK_CCL)
4288 mask |= detect_coding_ccl (src, src_end, multibytep);
4289 }
4290 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4291 }
4292
4293 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4294 The information of the detected coding system is set in CODING. */
4295
4296 void
4297 detect_coding (coding, src, src_bytes)
4298 struct coding_system *coding;
4299 const unsigned char *src;
4300 int src_bytes;
4301 {
4302 unsigned int idx;
4303 int skip, mask;
4304 Lisp_Object val;
4305
4306 val = Vcoding_category_list;
4307 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4308 coding->src_multibyte);
4309 coding->heading_ascii = skip;
4310
4311 if (!mask) return;
4312
4313 /* We found a single coding system of the highest priority in MASK. */
4314 idx = 0;
4315 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4316 if (! mask)
4317 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4318
4319 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4320
4321 if (coding->eol_type != CODING_EOL_UNDECIDED)
4322 {
4323 Lisp_Object tmp;
4324
4325 tmp = Fget (val, Qeol_type);
4326 if (VECTORP (tmp))
4327 val = XVECTOR (tmp)->contents[coding->eol_type];
4328 }
4329
4330 /* Setup this new coding system while preserving some slots. */
4331 {
4332 int src_multibyte = coding->src_multibyte;
4333 int dst_multibyte = coding->dst_multibyte;
4334
4335 setup_coding_system (val, coding);
4336 coding->src_multibyte = src_multibyte;
4337 coding->dst_multibyte = dst_multibyte;
4338 coding->heading_ascii = skip;
4339 }
4340 }
4341
4342 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4343 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4344 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4345
4346 How many non-eol characters are at the head is returned as *SKIP. */
4347
4348 #define MAX_EOL_CHECK_COUNT 3
4349
4350 static int
4351 detect_eol_type (source, src_bytes, skip)
4352 unsigned char *source;
4353 int src_bytes, *skip;
4354 {
4355 unsigned char *src = source, *src_end = src + src_bytes;
4356 unsigned char c;
4357 int total = 0; /* How many end-of-lines are found so far. */
4358 int eol_type = CODING_EOL_UNDECIDED;
4359 int this_eol_type;
4360
4361 *skip = 0;
4362
4363 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4364 {
4365 c = *src++;
4366 if (c == '\n' || c == '\r')
4367 {
4368 if (*skip == 0)
4369 *skip = src - 1 - source;
4370 total++;
4371 if (c == '\n')
4372 this_eol_type = CODING_EOL_LF;
4373 else if (src >= src_end || *src != '\n')
4374 this_eol_type = CODING_EOL_CR;
4375 else
4376 this_eol_type = CODING_EOL_CRLF, src++;
4377
4378 if (eol_type == CODING_EOL_UNDECIDED)
4379 /* This is the first end-of-line. */
4380 eol_type = this_eol_type;
4381 else if (eol_type != this_eol_type)
4382 {
4383 /* The found type is different from what found before. */
4384 eol_type = CODING_EOL_INCONSISTENT;
4385 break;
4386 }
4387 }
4388 }
4389
4390 if (*skip == 0)
4391 *skip = src_end - source;
4392 return eol_type;
4393 }
4394
4395 /* Like detect_eol_type, but detect EOL type in 2-octet
4396 big-endian/little-endian format for coding systems utf-16-be and
4397 utf-16-le. */
4398
4399 static int
4400 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4401 unsigned char *source;
4402 int src_bytes, *skip, big_endian_p;
4403 {
4404 unsigned char *src = source, *src_end = src + src_bytes;
4405 unsigned int c1, c2;
4406 int total = 0; /* How many end-of-lines are found so far. */
4407 int eol_type = CODING_EOL_UNDECIDED;
4408 int this_eol_type;
4409 int msb, lsb;
4410
4411 if (big_endian_p)
4412 msb = 0, lsb = 1;
4413 else
4414 msb = 1, lsb = 0;
4415
4416 *skip = 0;
4417
4418 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4419 {
4420 c1 = (src[msb] << 8) | (src[lsb]);
4421 src += 2;
4422
4423 if (c1 == '\n' || c1 == '\r')
4424 {
4425 if (*skip == 0)
4426 *skip = src - 2 - source;
4427 total++;
4428 if (c1 == '\n')
4429 {
4430 this_eol_type = CODING_EOL_LF;
4431 }
4432 else
4433 {
4434 if ((src + 1) >= src_end)
4435 {
4436 this_eol_type = CODING_EOL_CR;
4437 }
4438 else
4439 {
4440 c2 = (src[msb] << 8) | (src[lsb]);
4441 if (c2 == '\n')
4442 this_eol_type = CODING_EOL_CRLF, src += 2;
4443 else
4444 this_eol_type = CODING_EOL_CR;
4445 }
4446 }
4447
4448 if (eol_type == CODING_EOL_UNDECIDED)
4449 /* This is the first end-of-line. */
4450 eol_type = this_eol_type;
4451 else if (eol_type != this_eol_type)
4452 {
4453 /* The found type is different from what found before. */
4454 eol_type = CODING_EOL_INCONSISTENT;
4455 break;
4456 }
4457 }
4458 }
4459
4460 if (*skip == 0)
4461 *skip = src_end - source;
4462 return eol_type;
4463 }
4464
4465 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4466 is encoded. If it detects an appropriate format of end-of-line, it
4467 sets the information in *CODING. */
4468
4469 void
4470 detect_eol (coding, src, src_bytes)
4471 struct coding_system *coding;
4472 const unsigned char *src;
4473 int src_bytes;
4474 {
4475 Lisp_Object val;
4476 int skip;
4477 int eol_type;
4478
4479 switch (coding->category_idx)
4480 {
4481 case CODING_CATEGORY_IDX_UTF_16_BE:
4482 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4483 break;
4484 case CODING_CATEGORY_IDX_UTF_16_LE:
4485 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4486 break;
4487 default:
4488 eol_type = detect_eol_type (src, src_bytes, &skip);
4489 break;
4490 }
4491
4492 if (coding->heading_ascii > skip)
4493 coding->heading_ascii = skip;
4494 else
4495 skip = coding->heading_ascii;
4496
4497 if (eol_type == CODING_EOL_UNDECIDED)
4498 return;
4499 if (eol_type == CODING_EOL_INCONSISTENT)
4500 {
4501 #if 0
4502 /* This code is suppressed until we find a better way to
4503 distinguish raw text file and binary file. */
4504
4505 /* If we have already detected that the coding is raw-text, the
4506 coding should actually be no-conversion. */
4507 if (coding->type == coding_type_raw_text)
4508 {
4509 setup_coding_system (Qno_conversion, coding);
4510 return;
4511 }
4512 /* Else, let's decode only text code anyway. */
4513 #endif /* 0 */
4514 eol_type = CODING_EOL_LF;
4515 }
4516
4517 val = Fget (coding->symbol, Qeol_type);
4518 if (VECTORP (val) && XVECTOR (val)->size == 3)
4519 {
4520 int src_multibyte = coding->src_multibyte;
4521 int dst_multibyte = coding->dst_multibyte;
4522 struct composition_data *cmp_data = coding->cmp_data;
4523
4524 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4525 coding->src_multibyte = src_multibyte;
4526 coding->dst_multibyte = dst_multibyte;
4527 coding->heading_ascii = skip;
4528 coding->cmp_data = cmp_data;
4529 }
4530 }
4531
/* Number of bytes added on top of the computed size of every
   conversion buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source length is multiplied to size a decoding
   buffer for CODING (a pointer to struct coding_system): 3 for
   ISO2022, the magnification of the decoder CCL program for CCL, and
   2 for everything else.  CODING is parenthesized so that any pointer
   expression can be passed; it is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                             \
  ((coding)->type == coding_type_iso2022                        \
   ? 3                                                          \
   : ((coding)->type == coding_type_ccl                         \
      ? (coding)->spec.ccl.decoder.buf_magnification            \
      : 2))
4540
4541 /* Return maximum size (bytes) of a buffer enough for decoding
4542 SRC_BYTES of text encoded in CODING. */
4543
4544 int
4545 decoding_buffer_size (coding, src_bytes)
4546 struct coding_system *coding;
4547 int src_bytes;
4548 {
4549 return (src_bytes * DECODING_BUFFER_MAG (coding)
4550 + CONVERSION_BUFFER_EXTRA_ROOM);
4551 }
4552
4553 /* Return maximum size (bytes) of a buffer enough for encoding
4554 SRC_BYTES of text to CODING. */
4555
4556 int
4557 encoding_buffer_size (coding, src_bytes)
4558 struct coding_system *coding;
4559 int src_bytes;
4560 {
4561 int magnification;
4562
4563 if (coding->type == coding_type_ccl)
4564 {
4565 magnification = coding->spec.ccl.encoder.buf_magnification;
4566 if (coding->eol_type == CODING_EOL_CRLF)
4567 magnification *= 2;
4568 }
4569 else if (CODING_REQUIRE_ENCODING (coding))
4570 magnification = 3;
4571 else
4572 magnification = 1;
4573
4574 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4575 }
4576
/* Working buffer for code conversion.  It is filled in by
   allocate_conversion_buffer, enlarged by extend_conversion_buffer
   and released by free_conversion_buffer.  */
struct conversion_buffer
{
  /* Number of bytes available at DATA.  */
  int size;
  /* 1 if DATA was obtained from alloca, 0 if it is heap memory.  */
  int on_stack;
  /* The storage itself.  */
  unsigned char *data;
};
4584
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served by alloca, larger ones
   by xmalloc; BUF.on_stack records which.  This has to stay a macro
   so that the alloca storage belongs to the calling function.  BUF
   and LEN are parenthesized so that arbitrary expressions can be
   passed, but both are evaluated more than once.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4600
4601 /* Double the allocated memory for *BUF. */
4602 static void
4603 extend_conversion_buffer (buf)
4604 struct conversion_buffer *buf;
4605 {
4606 if (buf->on_stack)
4607 {
4608 unsigned char *save = buf->data;
4609 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4610 bcopy (save, buf->data, buf->size);
4611 buf->on_stack = 0;
4612 }
4613 else
4614 {
4615 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4616 }
4617 buf->size *= 2;
4618 }
4619
4620 /* Free the allocated memory for BUF if it is not on stack. */
4621 static void
4622 free_conversion_buffer (buf)
4623 struct conversion_buffer *buf;
4624 {
4625 if (!buf->on_stack)
4626 xfree (buf->data);
4627 }
4628
4629 int
4630 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4631 struct coding_system *coding;
4632 unsigned char *source, *destination;
4633 int src_bytes, dst_bytes, encodep;
4634 {
4635 struct ccl_program *ccl
4636 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4637 unsigned char *dst = destination;
4638
4639 ccl->suppress_error = coding->suppress_error;
4640 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4641 if (encodep)
4642 {
4643 /* On encoding, EOL format is converted within ccl_driver. For
4644 that, setup proper information in the structure CCL. */
4645 ccl->eol_type = coding->eol_type;
4646 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4647 ccl->eol_type = CODING_EOL_LF;
4648 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4649 ccl->eight_bit_control = coding->dst_multibyte;
4650 }
4651 else
4652 ccl->eight_bit_control = 1;
4653 ccl->multibyte = coding->src_multibyte;
4654 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4655 {
4656 /* Move carryover bytes to DESTINATION. */
4657 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4658 while (*p)
4659 *dst++ = *p++;
4660 coding->spec.ccl.eight_bit_carryover[0] = 0;
4661 if (dst_bytes)
4662 dst_bytes -= dst - destination;
4663 }
4664
4665 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4666 &(coding->consumed))
4667 + dst - destination);
4668
4669 if (encodep)
4670 {
4671 coding->produced_char = coding->produced;
4672 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4673 }
4674 else if (!ccl->eight_bit_control)
4675 {
4676 /* The produced bytes forms a valid multibyte sequence. */
4677 coding->produced_char
4678 = multibyte_chars_in_text (destination, coding->produced);
4679 coding->spec.ccl.eight_bit_carryover[0] = 0;
4680 }
4681 else
4682 {
4683 /* On decoding, the destination should always multibyte. But,
4684 CCL program might have been generated an invalid multibyte
4685 sequence. Here we make such a sequence valid as
4686 multibyte. */
4687 int bytes
4688 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4689
4690 if ((coding->consumed < src_bytes
4691 || !ccl->last_block)
4692 && coding->produced >= 1
4693 && destination[coding->produced - 1] >= 0x80)
4694 {
4695 /* We should not convert the tailing 8-bit codes to
4696 multibyte form even if they doesn't form a valid
4697 multibyte sequence. They may form a valid sequence in
4698 the next call. */
4699 int carryover = 0;
4700
4701 if (destination[coding->produced - 1] < 0xA0)
4702 carryover = 1;
4703 else if (coding->produced >= 2)
4704 {
4705 if (destination[coding->produced - 2] >= 0x80)
4706 {
4707 if (destination[coding->produced - 2] < 0xA0)
4708 carryover = 2;
4709 else if (coding->produced >= 3
4710 && destination[coding->produced - 3] >= 0x80
4711 && destination[coding->produced - 3] < 0xA0)
4712 carryover = 3;
4713 }
4714 }
4715 if (carryover > 0)
4716 {
4717 BCOPY_SHORT (destination + coding->produced - carryover,
4718 coding->spec.ccl.eight_bit_carryover,
4719 carryover);
4720 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4721 coding->produced -= carryover;
4722 }
4723 }
4724 coding->produced = str_as_multibyte (destination, bytes,
4725 coding->produced,
4726 &(coding->produced_char));
4727 }
4728
4729 switch (ccl->status)
4730 {
4731 case CCL_STAT_SUSPEND_BY_SRC:
4732 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4733 break;
4734 case CCL_STAT_SUSPEND_BY_DST:
4735 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4736 break;
4737 case CCL_STAT_QUIT:
4738 case CCL_STAT_INVALID_CMD:
4739 coding->result = CODING_FINISH_INTERRUPT;
4740 break;
4741 default:
4742 coding->result = CODING_FINISH_NORMAL;
4743 break;
4744 }
4745 return coding->result;
4746 }
4747
4748 /* Decode EOL format of the text at PTR of BYTES length destructively
4749 according to CODING->eol_type. This is called after the CCL
4750 program produced a decoded text at PTR. If we do CRLF->LF
4751 conversion, update CODING->produced and CODING->produced_char. */
4752
4753 static void
4754 decode_eol_post_ccl (coding, ptr, bytes)
4755 struct coding_system *coding;
4756 unsigned char *ptr;
4757 int bytes;
4758 {
4759 Lisp_Object val, saved_coding_symbol;
4760 unsigned char *pend = ptr + bytes;
4761 int dummy;
4762
4763 /* Remember the current coding system symbol. We set it back when
4764 an inconsistent EOL is found so that `last-coding-system-used' is
4765 set to the coding system that doesn't specify EOL conversion. */
4766 saved_coding_symbol = coding->symbol;
4767
4768 coding->spec.ccl.cr_carryover = 0;
4769 if (coding->eol_type == CODING_EOL_UNDECIDED)
4770 {
4771 /* Here, to avoid the call of setup_coding_system, we directly
4772 call detect_eol_type. */
4773 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4774 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4775 coding->eol_type = CODING_EOL_LF;
4776 if (coding->eol_type != CODING_EOL_UNDECIDED)
4777 {
4778 val = Fget (coding->symbol, Qeol_type);
4779 if (VECTORP (val) && XVECTOR (val)->size == 3)
4780 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4781 }
4782 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4783 }
4784
4785 if (coding->eol_type == CODING_EOL_LF
4786 || coding->eol_type == CODING_EOL_UNDECIDED)
4787 {
4788 /* We have nothing to do. */
4789 ptr = pend;
4790 }
4791 else if (coding->eol_type == CODING_EOL_CRLF)
4792 {
4793 unsigned char *pstart = ptr, *p = ptr;
4794
4795 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4796 && *(pend - 1) == '\r')
4797 {
4798 /* If the last character is CR, we can't handle it here
4799 because LF will be in the not-yet-decoded source text.
4800 Record that the CR is not yet processed. */
4801 coding->spec.ccl.cr_carryover = 1;
4802 coding->produced--;
4803 coding->produced_char--;
4804 pend--;
4805 }
4806 while (ptr < pend)
4807 {
4808 if (*ptr == '\r')
4809 {
4810 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4811 {
4812 *p++ = '\n';
4813 ptr += 2;
4814 }
4815 else
4816 {
4817 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4818 goto undo_eol_conversion;
4819 *p++ = *ptr++;
4820 }
4821 }
4822 else if (*ptr == '\n'
4823 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4824 goto undo_eol_conversion;
4825 else
4826 *p++ = *ptr++;
4827 continue;
4828
4829 undo_eol_conversion:
4830 /* We have faced with inconsistent EOL format at PTR.
4831 Convert all LFs before PTR back to CRLFs. */
4832 for (p--, ptr--; p >= pstart; p--)
4833 {
4834 if (*p == '\n')
4835 *ptr-- = '\n', *ptr-- = '\r';
4836 else
4837 *ptr-- = *p;
4838 }
4839 /* If carryover is recorded, cancel it because we don't
4840 convert CRLF anymore. */
4841 if (coding->spec.ccl.cr_carryover)
4842 {
4843 coding->spec.ccl.cr_carryover = 0;
4844 coding->produced++;
4845 coding->produced_char++;
4846 pend++;
4847 }
4848 p = ptr = pend;
4849 coding->eol_type = CODING_EOL_LF;
4850 coding->symbol = saved_coding_symbol;
4851 }
4852 if (p < pend)
4853 {
4854 /* As each two-byte sequence CRLF was converted to LF, (PEND
4855 - P) is the number of deleted characters. */
4856 coding->produced -= pend - p;
4857 coding->produced_char -= pend - p;
4858 }
4859 }
4860 else /* i.e. coding->eol_type == CODING_EOL_CR */
4861 {
4862 unsigned char *p = ptr;
4863
4864 for (; ptr < pend; ptr++)
4865 {
4866 if (*ptr == '\r')
4867 *ptr = '\n';
4868 else if (*ptr == '\n'
4869 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4870 {
4871 for (; p < ptr; p++)
4872 {
4873 if (*p == '\n')
4874 *p = '\r';
4875 }
4876 ptr = pend;
4877 coding->eol_type = CODING_EOL_LF;
4878 coding->symbol = saved_coding_symbol;
4879 }
4880 }
4881 }
4882 }
4883
4884 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4885 decoding, it may detect coding system and format of end-of-line if
4886 those are not yet decided. The source should be unibyte, the
4887 result is multibyte if CODING->dst_multibyte is nonzero, else
4888 unibyte. */
4889
4890 int
4891 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4892 struct coding_system *coding;
4893 const unsigned char *source;
4894 unsigned char *destination;
4895 int src_bytes, dst_bytes;
4896 {
4897 int extra = 0;
4898
4899 if (coding->type == coding_type_undecided)
4900 detect_coding (coding, source, src_bytes);
4901
4902 if (coding->eol_type == CODING_EOL_UNDECIDED
4903 && coding->type != coding_type_ccl)
4904 {
4905 detect_eol (coding, source, src_bytes);
4906 /* We had better recover the original eol format if we
4907 encounter an inconsistent eol format while decoding. */
4908 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4909 }
4910
4911 coding->produced = coding->produced_char = 0;
4912 coding->consumed = coding->consumed_char = 0;
4913 coding->errors = 0;
4914 coding->result = CODING_FINISH_NORMAL;
4915
4916 switch (coding->type)
4917 {
4918 case coding_type_sjis:
4919 decode_coding_sjis_big5 (coding, source, destination,
4920 src_bytes, dst_bytes, 1);
4921 break;
4922
4923 case coding_type_iso2022:
4924 decode_coding_iso2022 (coding, source, destination,
4925 src_bytes, dst_bytes);
4926 break;
4927
4928 case coding_type_big5:
4929 decode_coding_sjis_big5 (coding, source, destination,
4930 src_bytes, dst_bytes, 0);
4931 break;
4932
4933 case coding_type_emacs_mule:
4934 decode_coding_emacs_mule (coding, source, destination,
4935 src_bytes, dst_bytes);
4936 break;
4937
4938 case coding_type_ccl:
4939 if (coding->spec.ccl.cr_carryover)
4940 {
4941 /* Put the CR which was not processed by the previous call
4942 of decode_eol_post_ccl in DESTINATION. It will be
4943 decoded together with the following LF by the call to
4944 decode_eol_post_ccl below. */
4945 *destination = '\r';
4946 coding->produced++;
4947 coding->produced_char++;
4948 dst_bytes--;
4949 extra = coding->spec.ccl.cr_carryover;
4950 }
4951 ccl_coding_driver (coding, source, destination + extra,
4952 src_bytes, dst_bytes, 0);
4953 if (coding->eol_type != CODING_EOL_LF)
4954 {
4955 coding->produced += extra;
4956 coding->produced_char += extra;
4957 decode_eol_post_ccl (coding, destination, coding->produced);
4958 }
4959 break;
4960
4961 default:
4962 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4963 }
4964
4965 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4966 && coding->mode & CODING_MODE_LAST_BLOCK
4967 && coding->consumed == src_bytes)
4968 coding->result = CODING_FINISH_NORMAL;
4969
4970 if (coding->mode & CODING_MODE_LAST_BLOCK
4971 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4972 {
4973 const unsigned char *src = source + coding->consumed;
4974 unsigned char *dst = destination + coding->produced;
4975
4976 src_bytes -= coding->consumed;
4977 coding->errors++;
4978 if (COMPOSING_P (coding))
4979 DECODE_COMPOSITION_END ('1');
4980 while (src_bytes--)
4981 {
4982 int c = *src++;
4983 dst += CHAR_STRING (c, dst);
4984 coding->produced_char++;
4985 }
4986 coding->consumed = coding->consumed_char = src - source;
4987 coding->produced = dst - destination;
4988 coding->result = CODING_FINISH_NORMAL;
4989 }
4990
4991 if (!coding->dst_multibyte)
4992 {
4993 coding->produced = str_as_unibyte (destination, coding->produced);
4994 coding->produced_char = coding->produced;
4995 }
4996
4997 return coding->result;
4998 }
4999
5000 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
5001 multibyteness of the source is CODING->src_multibyte, the
5002 multibyteness of the result is always unibyte. */
5003
5004 int
5005 encode_coding (coding, source, destination, src_bytes, dst_bytes)
5006 struct coding_system *coding;
5007 const unsigned char *source;
5008 unsigned char *destination;
5009 int src_bytes, dst_bytes;
5010 {
5011 coding->produced = coding->produced_char = 0;
5012 coding->consumed = coding->consumed_char = 0;
5013 coding->errors = 0;
5014 coding->result = CODING_FINISH_NORMAL;
5015 if (coding->eol_type == CODING_EOL_UNDECIDED)
5016 coding->eol_type = CODING_EOL_LF;
5017
5018 switch (coding->type)
5019 {
5020 case coding_type_sjis:
5021 encode_coding_sjis_big5 (coding, source, destination,
5022 src_bytes, dst_bytes, 1);
5023 break;
5024
5025 case coding_type_iso2022:
5026 encode_coding_iso2022 (coding, source, destination,
5027 src_bytes, dst_bytes);
5028 break;
5029
5030 case coding_type_big5:
5031 encode_coding_sjis_big5 (coding, source, destination,
5032 src_bytes, dst_bytes, 0);
5033 break;
5034
5035 case coding_type_emacs_mule:
5036 encode_coding_emacs_mule (coding, source, destination,
5037 src_bytes, dst_bytes);
5038 break;
5039
5040 case coding_type_ccl:
5041 ccl_coding_driver (coding, source, destination,
5042 src_bytes, dst_bytes, 1);
5043 break;
5044
5045 default:
5046 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5047 }
5048
5049 if (coding->mode & CODING_MODE_LAST_BLOCK
5050 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5051 {
5052 const unsigned char *src = source + coding->consumed;
5053 unsigned char *dst = destination + coding->produced;
5054
5055 if (coding->type == coding_type_iso2022)
5056 ENCODE_RESET_PLANE_AND_REGISTER;
5057 if (COMPOSING_P (coding))
5058 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5059 if (coding->consumed < src_bytes)
5060 {
5061 int len = src_bytes - coding->consumed;
5062
5063 BCOPY_SHORT (src, dst, len);
5064 if (coding->src_multibyte)
5065 len = str_as_unibyte (dst, len);
5066 dst += len;
5067 coding->consumed = src_bytes;
5068 }
5069 coding->produced = coding->produced_char = dst - destination;
5070 coding->result = CODING_FINISH_NORMAL;
5071 }
5072
5073 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5074 && coding->consumed == src_bytes)
5075 coding->result = CODING_FINISH_NORMAL;
5076
5077 return coding->result;
5078 }
5079
5080 /* Scan text in the region between *BEG and *END (byte positions),
5081 skip characters which we don't have to decode by coding system
5082 CODING at the head and tail, then set *BEG and *END to the region
5083 of the text we actually have to convert. The caller should move
5084 the gap out of the region in advance if the region is from a
5085 buffer.
5086
5087 If STR is not NULL, *BEG and *END are indices into STR. */
5088
5089 static void
5090 shrink_decoding_region (beg, end, coding, str)
5091 int *beg, *end;
5092 struct coding_system *coding;
5093 unsigned char *str;
5094 {
5095 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5096 int eol_conversion;
5097 Lisp_Object translation_table;
5098
5099 if (coding->type == coding_type_ccl
5100 || coding->type == coding_type_undecided
5101 || coding->eol_type != CODING_EOL_LF
5102 || !NILP (coding->post_read_conversion)
5103 || coding->composing != COMPOSITION_DISABLED)
5104 {
5105 /* We can't skip any data. */
5106 return;
5107 }
5108 if (coding->type == coding_type_no_conversion
5109 || coding->type == coding_type_raw_text
5110 || coding->type == coding_type_emacs_mule)
5111 {
5112 /* We need no conversion, but don't have to skip any data here.
5113 Decoding routine handles them effectively anyway. */
5114 return;
5115 }
5116
5117 translation_table = coding->translation_table_for_decode;
5118 if (NILP (translation_table) && !NILP (Venable_character_translation))
5119 translation_table = Vstandard_translation_table_for_decode;
5120 if (CHAR_TABLE_P (translation_table))
5121 {
5122 int i;
5123 for (i = 0; i < 128; i++)
5124 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5125 break;
5126 if (i < 128)
5127 /* Some ASCII character should be translated. We give up
5128 shrinking. */
5129 return;
5130 }
5131
5132 if (coding->heading_ascii >= 0)
5133 /* Detection routine has already found how much we can skip at the
5134 head. */
5135 *beg += coding->heading_ascii;
5136
5137 if (str)
5138 {
5139 begp_orig = begp = str + *beg;
5140 endp_orig = endp = str + *end;
5141 }
5142 else
5143 {
5144 begp_orig = begp = BYTE_POS_ADDR (*beg);
5145 endp_orig = endp = begp + *end - *beg;
5146 }
5147
5148 eol_conversion = (coding->eol_type == CODING_EOL_CR
5149 || coding->eol_type == CODING_EOL_CRLF);
5150
5151 switch (coding->type)
5152 {
5153 case coding_type_sjis:
5154 case coding_type_big5:
5155 /* We can skip all ASCII characters at the head. */
5156 if (coding->heading_ascii < 0)
5157 {
5158 if (eol_conversion)
5159 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5160 else
5161 while (begp < endp && *begp < 0x80) begp++;
5162 }
5163 /* We can skip all ASCII characters at the tail except for the
5164 second byte of SJIS or BIG5 code. */
5165 if (eol_conversion)
5166 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5167 else
5168 while (begp < endp && endp[-1] < 0x80) endp--;
5169 /* Do not consider LF as ascii if preceded by CR, since that
5170 confuses eol decoding. */
5171 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5172 endp++;
5173 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5174 endp++;
5175 break;
5176
5177 case coding_type_iso2022:
5178 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5179 /* We can't skip any data. */
5180 break;
5181 if (coding->heading_ascii < 0)
5182 {
5183 /* We can skip all ASCII characters at the head except for a
5184 few control codes. */
5185 while (begp < endp && (c = *begp) < 0x80
5186 && c != ISO_CODE_CR && c != ISO_CODE_SO
5187 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5188 && (!eol_conversion || c != ISO_CODE_LF))
5189 begp++;
5190 }
5191 switch (coding->category_idx)
5192 {
5193 case CODING_CATEGORY_IDX_ISO_8_1:
5194 case CODING_CATEGORY_IDX_ISO_8_2:
5195 /* We can skip all ASCII characters at the tail. */
5196 if (eol_conversion)
5197 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5198 else
5199 while (begp < endp && endp[-1] < 0x80) endp--;
5200 /* Do not consider LF as ascii if preceded by CR, since that
5201 confuses eol decoding. */
5202 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5203 endp++;
5204 break;
5205
5206 case CODING_CATEGORY_IDX_ISO_7:
5207 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5208 {
5209 /* We can skip all characters at the tail except for 8-bit
5210 codes and ESC and the following 2-byte at the tail. */
5211 unsigned char *eight_bit = NULL;
5212
5213 if (eol_conversion)
5214 while (begp < endp
5215 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5216 {
5217 if (!eight_bit && c & 0x80) eight_bit = endp;
5218 endp--;
5219 }
5220 else
5221 while (begp < endp
5222 && (c = endp[-1]) != ISO_CODE_ESC)
5223 {
5224 if (!eight_bit && c & 0x80) eight_bit = endp;
5225 endp--;
5226 }
5227 /* Do not consider LF as ascii if preceded by CR, since that
5228 confuses eol decoding. */
5229 if (begp < endp && endp < endp_orig
5230 && endp[-1] == '\r' && endp[0] == '\n')
5231 endp++;
5232 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5233 {
5234 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5235 /* This is an ASCII designation sequence. We can
5236 surely skip the tail. But, if we have
5237 encountered an 8-bit code, skip only the codes
5238 after that. */
5239 endp = eight_bit ? eight_bit : endp + 2;
5240 else
5241 /* Hmmm, we can't skip the tail. */
5242 endp = endp_orig;
5243 }
5244 else if (eight_bit)
5245 endp = eight_bit;
5246 }
5247 }
5248 break;
5249
5250 default:
5251 abort ();
5252 }
5253 *beg += begp - begp_orig;
5254 *end += endp - endp_orig;
5255 return;
5256 }
5257
5258 /* Like shrink_decoding_region but for encoding. */
5259
5260 static void
5261 shrink_encoding_region (beg, end, coding, str)
5262 int *beg, *end;
5263 struct coding_system *coding;
5264 unsigned char *str;
5265 {
5266 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5267 int eol_conversion;
5268 Lisp_Object translation_table;
5269
5270 if (coding->type == coding_type_ccl
5271 || coding->eol_type == CODING_EOL_CRLF
5272 || coding->eol_type == CODING_EOL_CR
5273 || (coding->cmp_data && coding->cmp_data->used > 0))
5274 {
5275 /* We can't skip any data. */
5276 return;
5277 }
5278 if (coding->type == coding_type_no_conversion
5279 || coding->type == coding_type_raw_text
5280 || coding->type == coding_type_emacs_mule
5281 || coding->type == coding_type_undecided)
5282 {
5283 /* We need no conversion, but don't have to skip any data here.
5284 Encoding routine handles them effectively anyway. */
5285 return;
5286 }
5287
5288 translation_table = coding->translation_table_for_encode;
5289 if (NILP (translation_table) && !NILP (Venable_character_translation))
5290 translation_table = Vstandard_translation_table_for_encode;
5291 if (CHAR_TABLE_P (translation_table))
5292 {
5293 int i;
5294 for (i = 0; i < 128; i++)
5295 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5296 break;
5297 if (i < 128)
5298 /* Some ASCII character should be translated. We give up
5299 shrinking. */
5300 return;
5301 }
5302
5303 if (str)
5304 {
5305 begp_orig = begp = str + *beg;
5306 endp_orig = endp = str + *end;
5307 }
5308 else
5309 {
5310 begp_orig = begp = BYTE_POS_ADDR (*beg);
5311 endp_orig = endp = begp + *end - *beg;
5312 }
5313
5314 eol_conversion = (coding->eol_type == CODING_EOL_CR
5315 || coding->eol_type == CODING_EOL_CRLF);
5316
5317 /* Here, we don't have to check coding->pre_write_conversion because
5318 the caller is expected to have handled it already. */
5319 switch (coding->type)
5320 {
5321 case coding_type_iso2022:
5322 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5323 /* We can't skip any data. */
5324 break;
5325 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5326 {
5327 unsigned char *bol = begp;
5328 while (begp < endp && *begp < 0x80)
5329 {
5330 begp++;
5331 if (begp[-1] == '\n')
5332 bol = begp;
5333 }
5334 begp = bol;
5335 goto label_skip_tail;
5336 }
5337 /* fall down ... */
5338
5339 case coding_type_sjis:
5340 case coding_type_big5:
5341 /* We can skip all ASCII characters at the head and tail. */
5342 if (eol_conversion)
5343 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5344 else
5345 while (begp < endp && *begp < 0x80) begp++;
5346 label_skip_tail:
5347 if (eol_conversion)
5348 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5349 else
5350 while (begp < endp && *(endp - 1) < 0x80) endp--;
5351 break;
5352
5353 default:
5354 abort ();
5355 }
5356
5357 *beg += begp - begp_orig;
5358 *end += endp - endp_orig;
5359 return;
5360 }
5361
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END for coding system CODING if it is
   longer than shrink_conversion_region_threshhold.  A nonzero ENCODEP
   selects shrink_encoding_region, zero selects
   shrink_decoding_region.  STR is passed through unchanged.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)       \
  do {                                                                 \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)        \
      ;                                                                \
    else if (encodep)                                                  \
      shrink_encoding_region (beg, end, coding, str);                  \
    else                                                               \
      shrink_decoding_region (beg, end, coding, str);                  \
  } while (0)
5375
5376 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5377 Vlast_coding_system_used and the remaining elements are buffers to
5378 kill. */
5379 static Lisp_Object
5380 code_convert_region_unwind (arg)
5381 Lisp_Object arg;
5382 {
5383 struct gcpro gcpro1;
5384 GCPRO1 (arg);
5385
5386 inhibit_pre_post_conversion = 0;
5387 Vlast_coding_system_used = XCAR (arg);
5388 for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5389 Fkill_buffer (XCAR (arg));
5390
5391 UNGCPRO;
5392 return Qnil;
5393 }
5394
5395 /* Store information about all compositions in the range FROM and TO
5396 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5397 buffer or a string, defaults to the current buffer. */
5398
5399 void
5400 coding_save_composition (coding, from, to, obj)
5401 struct coding_system *coding;
5402 int from, to;
5403 Lisp_Object obj;
5404 {
5405 Lisp_Object prop;
5406 int start, end;
5407
5408 if (coding->composing == COMPOSITION_DISABLED)
5409 return;
5410 if (!coding->cmp_data)
5411 coding_allocate_composition_data (coding, from);
5412 if (!find_composition (from, to, &start, &end, &prop, obj)
5413 || end > to)
5414 return;
5415 if (start < from
5416 && (!find_composition (end, to, &start, &end, &prop, obj)
5417 || end > to))
5418 return;
5419 coding->composing = COMPOSITION_NO;
5420 do
5421 {
5422 if (COMPOSITION_VALID_P (start, end, prop))
5423 {
5424 enum composition_method method = COMPOSITION_METHOD (prop);
5425 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5426 >= COMPOSITION_DATA_SIZE)
5427 coding_allocate_composition_data (coding, from);
5428 /* For relative composition, we remember start and end
5429 positions, for the other compositions, we also remember
5430 components. */
5431 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5432 if (method != COMPOSITION_RELATIVE)
5433 {
5434 /* We must store a*/
5435 Lisp_Object val, ch;
5436
5437 val = COMPOSITION_COMPONENTS (prop);
5438 if (CONSP (val))
5439 while (CONSP (val))
5440 {
5441 ch = XCAR (val), val = XCDR (val);
5442 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5443 }
5444 else if (VECTORP (val) || STRINGP (val))
5445 {
5446 int len = (VECTORP (val)
5447 ? XVECTOR (val)->size : SCHARS (val));
5448 int i;
5449 for (i = 0; i < len; i++)
5450 {
5451 ch = (STRINGP (val)
5452 ? Faref (val, make_number (i))
5453 : XVECTOR (val)->contents[i]);
5454 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5455 }
5456 }
5457 else /* INTEGERP (val) */
5458 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5459 }
5460 CODING_ADD_COMPOSITION_END (coding, end - from);
5461 }
5462 start = end;
5463 }
5464 while (start < to
5465 && find_composition (start, to, &start, &end, &prop, obj)
5466 && end <= to);
5467
5468 /* Make coding->cmp_data point to the first memory block. */
5469 while (coding->cmp_data->prev)
5470 coding->cmp_data = coding->cmp_data->prev;
5471 coding->cmp_data_start = 0;
5472 }
5473
5474 /* Reflect the saved information about compositions to OBJ.
5475 CODING->cmp_data points to a memory block for the information. OBJ
5476 is a buffer or a string, defaults to the current buffer. */
5477
5478 void
5479 coding_restore_composition (coding, obj)
5480 struct coding_system *coding;
5481 Lisp_Object obj;
5482 {
5483 struct composition_data *cmp_data = coding->cmp_data;
5484
5485 if (!cmp_data)
5486 return;
5487
5488 while (cmp_data->prev)
5489 cmp_data = cmp_data->prev;
5490
5491 while (cmp_data)
5492 {
5493 int i;
5494
5495 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5496 i += cmp_data->data[i])
5497 {
5498 int *data = cmp_data->data + i;
5499 enum composition_method method = (enum composition_method) data[3];
5500 Lisp_Object components;
5501
5502 if (data[0] < 0 || i + data[0] > cmp_data->used)
5503 /* Invalid composition data. */
5504 break;
5505
5506 if (method == COMPOSITION_RELATIVE)
5507 components = Qnil;
5508 else
5509 {
5510 int len = data[0] - 4, j;
5511 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5512
5513 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5514 && len % 2 == 0)
5515 len --;
5516 if (len < 1)
5517 /* Invalid composition data. */
5518 break;
5519 for (j = 0; j < len; j++)
5520 args[j] = make_number (data[4 + j]);
5521 components = (method == COMPOSITION_WITH_ALTCHARS
5522 ? Fstring (len, args)
5523 : Fvector (len, args));
5524 }
5525 compose_text (data[1], data[2], components, Qnil, obj);
5526 }
5527 cmp_data = cmp_data->next;
5528 }
5529 }
5530
5531 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5532 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5533 coding system CODING, and return the status code of code conversion
5534 (currently, this value has no meaning).
5535
5536 How many characters (and bytes) are converted to how many
5537 characters (and bytes) are recorded in members of the structure
5538 CODING.
5539
5540 If REPLACE is nonzero, we do various things as if the original text
5541 is deleted and a new text is inserted. See the comments in
5542 replace_range (insdel.c) to know what we are doing.
5543
5544 If REPLACE is zero, it is assumed that the source text is unibyte.
5545 Otherwise, it is assumed that the source text is multibyte. */
5546
5547 int
5548 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5549 int from, from_byte, to, to_byte, encodep, replace;
5550 struct coding_system *coding;
5551 {
5552 int len = to - from, len_byte = to_byte - from_byte;
5553 int nchars_del = 0, nbytes_del = 0;
5554 int require, inserted, inserted_byte;
5555 int head_skip, tail_skip, total_skip = 0;
5556 Lisp_Object saved_coding_symbol;
5557 int first = 1;
5558 unsigned char *src, *dst;
5559 Lisp_Object deletion;
5560 int orig_point = PT, orig_len = len;
5561 int prev_Z;
5562 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5563
5564 deletion = Qnil;
5565 saved_coding_symbol = coding->symbol;
5566
5567 if (from < PT && PT < to)
5568 {
5569 TEMP_SET_PT_BOTH (from, from_byte);
5570 orig_point = from;
5571 }
5572
5573 if (replace)
5574 {
5575 int saved_from = from;
5576 int saved_inhibit_modification_hooks;
5577
5578 prepare_to_modify_buffer (from, to, &from);
5579 if (saved_from != from)
5580 {
5581 to = from + len;
5582 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5583 len_byte = to_byte - from_byte;
5584 }
5585
5586 /* The code conversion routine can not preserve text properties
5587 for now. So, we must remove all text properties in the
5588 region. Here, we must suppress all modification hooks. */
5589 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5590 inhibit_modification_hooks = 1;
5591 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5592 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5593 }
5594
5595 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5596 {
5597 /* We must detect encoding of text and eol format. */
5598
5599 if (from < GPT && to > GPT)
5600 move_gap_both (from, from_byte);
5601 if (coding->type == coding_type_undecided)
5602 {
5603 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5604 if (coding->type == coding_type_undecided)
5605 {
5606 /* It seems that the text contains only ASCII, but we
5607 should not leave it undecided because the deeper
5608 decoding routine (decode_coding) tries to detect the
5609 encodings again in vain. */
5610 coding->type = coding_type_emacs_mule;
5611 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5612 /* As emacs-mule decoder will handle composition, we
5613 need this setting to allocate coding->cmp_data
5614 later. */
5615 coding->composing = COMPOSITION_NO;
5616 }
5617 }
5618 if (coding->eol_type == CODING_EOL_UNDECIDED
5619 && coding->type != coding_type_ccl)
5620 {
5621 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5622 if (coding->eol_type == CODING_EOL_UNDECIDED)
5623 coding->eol_type = CODING_EOL_LF;
5624 /* We had better recover the original eol format if we
5625 encounter an inconsistent eol format while decoding. */
5626 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5627 }
5628 }
5629
5630 /* Now we convert the text. */
5631
5632 /* For encoding, we must process pre-write-conversion in advance. */
5633 if (! inhibit_pre_post_conversion
5634 && encodep
5635 && SYMBOLP (coding->pre_write_conversion)
5636 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5637 {
5638 /* The function in pre-write-conversion may put a new text in a
5639 new buffer. */
5640 struct buffer *prev = current_buffer;
5641 Lisp_Object new;
5642
5643 record_unwind_protect (code_convert_region_unwind,
5644 Fcons (Vlast_coding_system_used, Qnil));
5645 /* We should not call any more pre-write/post-read-conversion
5646 functions while this pre-write-conversion is running. */
5647 inhibit_pre_post_conversion = 1;
5648 call2 (coding->pre_write_conversion,
5649 make_number (from), make_number (to));
5650 inhibit_pre_post_conversion = 0;
5651 /* Discard the unwind protect. */
5652 specpdl_ptr--;
5653
5654 if (current_buffer != prev)
5655 {
5656 len = ZV - BEGV;
5657 new = Fcurrent_buffer ();
5658 set_buffer_internal_1 (prev);
5659 del_range_2 (from, from_byte, to, to_byte, 0);
5660 TEMP_SET_PT_BOTH (from, from_byte);
5661 insert_from_buffer (XBUFFER (new), 1, len, 0);
5662 Fkill_buffer (new);
5663 if (orig_point >= to)
5664 orig_point += len - orig_len;
5665 else if (orig_point > from)
5666 orig_point = from;
5667 orig_len = len;
5668 to = from + len;
5669 from_byte = CHAR_TO_BYTE (from);
5670 to_byte = CHAR_TO_BYTE (to);
5671 len_byte = to_byte - from_byte;
5672 TEMP_SET_PT_BOTH (from, from_byte);
5673 }
5674 }
5675
5676 if (replace)
5677 {
5678 if (! EQ (current_buffer->undo_list, Qt))
5679 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5680 else
5681 {
5682 nchars_del = to - from;
5683 nbytes_del = to_byte - from_byte;
5684 }
5685 }
5686
5687 if (coding->composing != COMPOSITION_DISABLED)
5688 {
5689 if (encodep)
5690 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5691 else
5692 coding_allocate_composition_data (coding, from);
5693 }
5694
5695 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5696 if we must run CCL program or there are compositions to
5697 encode. */
5698 if (coding->type != coding_type_ccl
5699 && (! coding->cmp_data || coding->cmp_data->used == 0))
5700 {
5701 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5702
5703 if (from < GPT && GPT < to)
5704 move_gap_both (from, from_byte);
5705 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5706 if (from_byte == to_byte
5707 && (encodep || NILP (coding->post_read_conversion))
5708 && ! CODING_REQUIRE_FLUSHING (coding))
5709 {
5710 coding->produced = len_byte;
5711 coding->produced_char = len;
5712 if (!replace)
5713 /* We must record and adjust for this new text now. */
5714 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5715 coding_free_composition_data (coding);
5716 return 0;
5717 }
5718
5719 head_skip = from_byte - from_byte_orig;
5720 tail_skip = to_byte_orig - to_byte;
5721 total_skip = head_skip + tail_skip;
5722 from += head_skip;
5723 to -= tail_skip;
5724 len -= total_skip; len_byte -= total_skip;
5725 }
5726
5727 /* For conversion, we must put the gap before the text in addition to
5728 making the gap larger for efficient decoding. The required gap
5729 size starts from 2000 which is the magic number used in make_gap.
5730 But, after one batch of conversion, it will be incremented if we
5731 find that it is not enough . */
5732 require = 2000;
5733
5734 if (GAP_SIZE < require)
5735 make_gap (require - GAP_SIZE);
5736 move_gap_both (from, from_byte);
5737
5738 inserted = inserted_byte = 0;
5739
5740 GAP_SIZE += len_byte;
5741 ZV -= len;
5742 Z -= len;
5743 ZV_BYTE -= len_byte;
5744 Z_BYTE -= len_byte;
5745
5746 if (GPT - BEG < BEG_UNCHANGED)
5747 BEG_UNCHANGED = GPT - BEG;
5748 if (Z - GPT < END_UNCHANGED)
5749 END_UNCHANGED = Z - GPT;
5750
5751 if (!encodep && coding->src_multibyte)
5752 {
5753 /* Decoding routines expects that the source text is unibyte.
5754 We must convert 8-bit characters of multibyte form to
5755 unibyte. */
5756 int len_byte_orig = len_byte;
5757 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5758 if (len_byte < len_byte_orig)
5759 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5760 len_byte);
5761 coding->src_multibyte = 0;
5762 }
5763
5764 for (;;)
5765 {
5766 int result;
5767
5768 /* The buffer memory is now:
5769 +--------+converted-text+---------+-------original-text-------+---+
5770 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5771 |<---------------------- GAP ----------------------->| */
5772 src = GAP_END_ADDR - len_byte;
5773 dst = GPT_ADDR + inserted_byte;
5774
5775 if (encodep)
5776 result = encode_coding (coding, src, dst, len_byte, 0);
5777 else
5778 {
5779 if (coding->composing != COMPOSITION_DISABLED)
5780 coding->cmp_data->char_offset = from + inserted;
5781 result = decode_coding (coding, src, dst, len_byte, 0);
5782 }
5783
5784 /* The buffer memory is now:
5785 +--------+-------converted-text----+--+------original-text----+---+
5786 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5787 |<---------------------- GAP ----------------------->| */
5788
5789 inserted += coding->produced_char;
5790 inserted_byte += coding->produced;
5791 len_byte -= coding->consumed;
5792
5793 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5794 {
5795 coding_allocate_composition_data (coding, from + inserted);
5796 continue;
5797 }
5798
5799 src += coding->consumed;
5800 dst += coding->produced;
5801
5802 if (result == CODING_FINISH_NORMAL)
5803 {
5804 src += len_byte;
5805 break;
5806 }
5807 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5808 {
5809 unsigned char *pend = dst, *p = pend - inserted_byte;
5810 Lisp_Object eol_type;
5811
5812 /* Encode LFs back to the original eol format (CR or CRLF). */
5813 if (coding->eol_type == CODING_EOL_CR)
5814 {
5815 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5816 }
5817 else
5818 {
5819 int count = 0;
5820
5821 while (p < pend) if (*p++ == '\n') count++;
5822 if (src - dst < count)
5823 {
5824 /* We don't have sufficient room for encoding LFs
5825 back to CRLF. We must record converted and
5826 not-yet-converted text back to the buffer
5827 content, enlarge the gap, then record them out of
5828 the buffer contents again. */
5829 int add = len_byte + inserted_byte;
5830
5831 GAP_SIZE -= add;
5832 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5833 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5834 make_gap (count - GAP_SIZE);
5835 GAP_SIZE += add;
5836 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5837 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5838 /* Don't forget to update SRC, DST, and PEND. */
5839 src = GAP_END_ADDR - len_byte;
5840 dst = GPT_ADDR + inserted_byte;
5841 pend = dst;
5842 }
5843 inserted += count;
5844 inserted_byte += count;
5845 coding->produced += count;
5846 p = dst = pend + count;
5847 while (count)
5848 {
5849 *--p = *--pend;
5850 if (*p == '\n') count--, *--p = '\r';
5851 }
5852 }
5853
5854 /* Suppress eol-format conversion in the further conversion. */
5855 coding->eol_type = CODING_EOL_LF;
5856
5857 /* Set the coding system symbol to that for Unix-like EOL. */
5858 eol_type = Fget (saved_coding_symbol, Qeol_type);
5859 if (VECTORP (eol_type)
5860 && XVECTOR (eol_type)->size == 3
5861 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5862 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5863 else
5864 coding->symbol = saved_coding_symbol;
5865
5866 continue;
5867 }
5868 if (len_byte <= 0)
5869 {
5870 if (coding->type != coding_type_ccl
5871 || coding->mode & CODING_MODE_LAST_BLOCK)
5872 break;
5873 coding->mode |= CODING_MODE_LAST_BLOCK;
5874 continue;
5875 }
5876 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5877 {
5878 /* The source text ends in invalid codes. Let's just
5879 make them valid buffer contents, and finish conversion. */
5880 if (multibyte_p)
5881 {
5882 unsigned char *start = dst;
5883
5884 inserted += len_byte;
5885 while (len_byte--)
5886 {
5887 int c = *src++;
5888 dst += CHAR_STRING (c, dst);
5889 }
5890
5891 inserted_byte += dst - start;
5892 }
5893 else
5894 {
5895 inserted += len_byte;
5896 inserted_byte += len_byte;
5897 while (len_byte--)
5898 *dst++ = *src++;
5899 }
5900 break;
5901 }
5902 if (result == CODING_FINISH_INTERRUPT)
5903 {
5904 /* The conversion procedure was interrupted by a user. */
5905 break;
5906 }
5907 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5908 if (coding->consumed < 1)
5909 {
5910 /* It's quite strange to require more memory without
5911 consuming any bytes. Perhaps CCL program bug. */
5912 break;
5913 }
5914 if (first)
5915 {
5916 /* We have just done the first batch of conversion which was
5917 stopped because of insufficient gap. Let's reconsider the
5918 required gap size (i.e. SRT - DST) now.
5919
5920 We have converted ORIG bytes (== coding->consumed) into
5921 NEW bytes (coding->produced). To convert the remaining
5922 LEN bytes, we may need REQUIRE bytes of gap, where:
5923 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5924 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5925 Here, we are sure that NEW >= ORIG. */
5926
5927 if (coding->produced <= coding->consumed)
5928 {
5929 /* This happens because of CCL-based coding system with
5930 eol-type CRLF. */
5931 require = 0;
5932 }
5933 else
5934 {
5935 float ratio = coding->produced - coding->consumed;
5936 ratio /= coding->consumed;
5937 require = len_byte * ratio;
5938 }
5939 first = 0;
5940 }
5941 if ((src - dst) < (require + 2000))
5942 {
5943 /* See the comment above the previous call of make_gap. */
5944 int add = len_byte + inserted_byte;
5945
5946 GAP_SIZE -= add;
5947 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5948 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5949 make_gap (require + 2000);
5950 GAP_SIZE += add;
5951 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5952 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5953 }
5954 }
5955 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5956
5957 if (encodep && coding->dst_multibyte)
5958 {
5959 /* The output is unibyte. We must convert 8-bit characters to
5960 multibyte form. */
5961 if (inserted_byte * 2 > GAP_SIZE)
5962 {
5963 GAP_SIZE -= inserted_byte;
5964 ZV += inserted_byte; Z += inserted_byte;
5965 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5966 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5967 make_gap (inserted_byte - GAP_SIZE);
5968 GAP_SIZE += inserted_byte;
5969 ZV -= inserted_byte; Z -= inserted_byte;
5970 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5971 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5972 }
5973 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5974 }
5975
5976 /* If we shrank the conversion area, adjust it now. */
5977 if (total_skip > 0)
5978 {
5979 if (tail_skip > 0)
5980 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5981 inserted += total_skip; inserted_byte += total_skip;
5982 GAP_SIZE += total_skip;
5983 GPT -= head_skip; GPT_BYTE -= head_skip;
5984 ZV -= total_skip; ZV_BYTE -= total_skip;
5985 Z -= total_skip; Z_BYTE -= total_skip;
5986 from -= head_skip; from_byte -= head_skip;
5987 to += tail_skip; to_byte += tail_skip;
5988 }
5989
5990 prev_Z = Z;
5991 if (! EQ (current_buffer->undo_list, Qt))
5992 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5993 else
5994 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5995 inserted, inserted_byte);
5996 inserted = Z - prev_Z;
5997
5998 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5999 coding_restore_composition (coding, Fcurrent_buffer ());
6000 coding_free_composition_data (coding);
6001
6002 if (! inhibit_pre_post_conversion
6003 && ! encodep && ! NILP (coding->post_read_conversion))
6004 {
6005 Lisp_Object val;
6006 Lisp_Object saved_coding_system;
6007
6008 if (from != PT)
6009 TEMP_SET_PT_BOTH (from, from_byte);
6010 prev_Z = Z;
6011 record_unwind_protect (code_convert_region_unwind,
6012 Fcons (Vlast_coding_system_used, Qnil));
6013 saved_coding_system = Vlast_coding_system_used;
6014 Vlast_coding_system_used = coding->symbol;
6015 /* We should not call any more pre-write/post-read-conversion
6016 functions while this post-read-conversion is running. */
6017 inhibit_pre_post_conversion = 1;
6018 val = call1 (coding->post_read_conversion, make_number (inserted));
6019 inhibit_pre_post_conversion = 0;
6020 coding->symbol = Vlast_coding_system_used;
6021 Vlast_coding_system_used = saved_coding_system;
6022 /* Discard the unwind protect. */
6023 specpdl_ptr--;
6024 CHECK_NUMBER (val);
6025 inserted += Z - prev_Z;
6026 }
6027
6028 if (orig_point >= from)
6029 {
6030 if (orig_point >= from + orig_len)
6031 orig_point += inserted - orig_len;
6032 else
6033 orig_point = from;
6034 TEMP_SET_PT (orig_point);
6035 }
6036
6037 if (replace)
6038 {
6039 signal_after_change (from, to - from, inserted);
6040 update_compositions (from, from + inserted, CHECK_BORDER);
6041 }
6042
6043 {
6044 coding->consumed = to_byte - from_byte;
6045 coding->consumed_char = to - from;
6046 coding->produced = inserted_byte;
6047 coding->produced_char = inserted;
6048 }
6049
6050 return 0;
6051 }
6052
6053 /* Name (or base name) of work buffer for code conversion. */
6054 static Lisp_Object Vcode_conversion_workbuf_name;
6055
6056 /* Set the current buffer to the working buffer prepared for
6057 code-conversion. MULTIBYTE specifies the multibyteness of the
6058 buffer. Return the buffer we set if it must be killed after use.
6059 Otherwise return Qnil. */
6060
6061 static Lisp_Object
6062 set_conversion_work_buffer (multibyte)
6063 int multibyte;
6064 {
6065 Lisp_Object buffer, buffer_to_kill;
6066 struct buffer *buf;
6067
6068 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6069 buf = XBUFFER (buffer);
6070 if (buf == current_buffer)
6071 {
6072 /* As we are already in the work buffer, we must generate a new
6073 buffer for the work. */
6074 Lisp_Object name;
6075
6076 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6077 buffer = buffer_to_kill = Fget_buffer_create (name);
6078 buf = XBUFFER (buffer);
6079 }
6080 else
6081 buffer_to_kill = Qnil;
6082
6083 delete_all_overlays (buf);
6084 buf->directory = current_buffer->directory;
6085 buf->read_only = Qnil;
6086 buf->filename = Qnil;
6087 buf->undo_list = Qt;
6088 eassert (buf->overlays_before == NULL);
6089 eassert (buf->overlays_after == NULL);
6090 set_buffer_internal (buf);
6091 if (BEG != BEGV || Z != ZV)
6092 Fwiden ();
6093 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6094 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6095 return buffer_to_kill;
6096 }
6097
6098 Lisp_Object
6099 run_pre_post_conversion_on_str (str, coding, encodep)
6100 Lisp_Object str;
6101 struct coding_system *coding;
6102 int encodep;
6103 {
6104 int count = SPECPDL_INDEX ();
6105 struct gcpro gcpro1, gcpro2;
6106 int multibyte = STRING_MULTIBYTE (str);
6107 Lisp_Object old_deactivate_mark;
6108 Lisp_Object buffer_to_kill;
6109 Lisp_Object unwind_arg;
6110
6111 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6112 /* It is not crucial to specbind this. */
6113 old_deactivate_mark = Vdeactivate_mark;
6114 GCPRO2 (str, old_deactivate_mark);
6115
6116 /* We must insert the contents of STR as is without
6117 unibyte<->multibyte conversion. For that, we adjust the
6118 multibyteness of the working buffer to that of STR. */
6119 buffer_to_kill = set_conversion_work_buffer (multibyte);
6120 if (NILP (buffer_to_kill))
6121 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6122 else
6123 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6124 record_unwind_protect (code_convert_region_unwind, unwind_arg);
6125
6126 insert_from_string (str, 0, 0,
6127 SCHARS (str), SBYTES (str), 0);
6128 UNGCPRO;
6129 inhibit_pre_post_conversion = 1;
6130 if (encodep)
6131 {
6132 struct buffer *prev = current_buffer;
6133
6134 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6135 if (prev != current_buffer)
6136 /* We must kill the current buffer too. */
6137 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6138 }
6139 else
6140 {
6141 Vlast_coding_system_used = coding->symbol;
6142 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6143 call1 (coding->post_read_conversion, make_number (Z - BEG));
6144 coding->symbol = Vlast_coding_system_used;
6145 }
6146 inhibit_pre_post_conversion = 0;
6147 Vdeactivate_mark = old_deactivate_mark;
6148 str = make_buffer_string (BEG, Z, 1);
6149 return unbind_to (count, str);
6150 }
6151
6152
6153 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6154 text in *STR. *SIZE is the allocated bytes for STR. As it
6155 is intended that this function is called from encode_terminal_code,
6156 the pre-write-conversion function is run by safe_call and thus
6157 "Error during redisplay: ..." is logged when an error occurs.
6158
6159 Store the resulting text in *STR and set CODING->produced_char and
6160 CODING->produced to the number of characters and bytes
6161 respectively. If the size of *STR is too small, enlarge it by
6162 xrealloc and update *STR and *SIZE. */
6163
6164 void
6165 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6166 unsigned char **str;
6167 int *size, nchars, nbytes;
6168 struct coding_system *coding;
6169 {
6170 struct gcpro gcpro1, gcpro2;
6171 struct buffer *cur = current_buffer;
6172 struct buffer *prev;
6173 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6174 Lisp_Object args[3];
6175 Lisp_Object buffer_to_kill;
6176
6177 /* It is not crucial to specbind this. */
6178 old_deactivate_mark = Vdeactivate_mark;
6179 old_last_coding_system_used = Vlast_coding_system_used;
6180 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6181
6182 /* We must insert the contents of STR as is without
6183 unibyte<->multibyte conversion. For that, we adjust the
6184 multibyteness of the working buffer to that of STR. */
6185 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6186 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6187 UNGCPRO;
6188 inhibit_pre_post_conversion = 1;
6189 prev = current_buffer;
6190 args[0] = coding->pre_write_conversion;
6191 args[1] = make_number (BEG);
6192 args[2] = make_number (Z);
6193 safe_call (3, args);
6194 inhibit_pre_post_conversion = 0;
6195 Vdeactivate_mark = old_deactivate_mark;
6196 Vlast_coding_system_used = old_last_coding_system_used;
6197 coding->produced_char = Z - BEG;
6198 coding->produced = Z_BYTE - BEG_BYTE;
6199 if (coding->produced > *size)
6200 {
6201 *size = coding->produced;
6202 *str = xrealloc (*str, *size);
6203 }
6204 if (BEG < GPT && GPT < Z)
6205 move_gap (BEG);
6206 bcopy (BEG_ADDR, *str, coding->produced);
6207 coding->src_multibyte
6208 = ! NILP (current_buffer->enable_multibyte_characters);
6209 if (prev != current_buffer)
6210 Fkill_buffer (Fcurrent_buffer ());
6211 set_buffer_internal (cur);
6212 if (! NILP (buffer_to_kill))
6213 Fkill_buffer (buffer_to_kill);
6214 }
6215
6216
6217 Lisp_Object
6218 decode_coding_string (str, coding, nocopy)
6219 Lisp_Object str;
6220 struct coding_system *coding;
6221 int nocopy;
6222 {
6223 int len;
6224 struct conversion_buffer buf;
6225 int from, to_byte;
6226 Lisp_Object saved_coding_symbol;
6227 int result;
6228 int require_decoding;
6229 int shrinked_bytes = 0;
6230 Lisp_Object newstr;
6231 int consumed, consumed_char, produced, produced_char;
6232
6233 from = 0;
6234 to_byte = SBYTES (str);
6235
6236 saved_coding_symbol = coding->symbol;
6237 coding->src_multibyte = STRING_MULTIBYTE (str);
6238 coding->dst_multibyte = 1;
6239 if (CODING_REQUIRE_DETECTION (coding))
6240 {
6241 /* See the comments in code_convert_region. */
6242 if (coding->type == coding_type_undecided)
6243 {
6244 detect_coding (coding, SDATA (str), to_byte);
6245 if (coding->type == coding_type_undecided)
6246 {
6247 coding->type = coding_type_emacs_mule;
6248 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6249 /* As emacs-mule decoder will handle composition, we
6250 need this setting to allocate coding->cmp_data
6251 later. */
6252 coding->composing = COMPOSITION_NO;
6253 }
6254 }
6255 if (coding->eol_type == CODING_EOL_UNDECIDED
6256 && coding->type != coding_type_ccl)
6257 {
6258 saved_coding_symbol = coding->symbol;
6259 detect_eol (coding, SDATA (str), to_byte);
6260 if (coding->eol_type == CODING_EOL_UNDECIDED)
6261 coding->eol_type = CODING_EOL_LF;
6262 /* We had better recover the original eol format if we
6263 encounter an inconsistent eol format while decoding. */
6264 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6265 }
6266 }
6267
6268 if (coding->type == coding_type_no_conversion
6269 || coding->type == coding_type_raw_text)
6270 coding->dst_multibyte = 0;
6271
6272 require_decoding = CODING_REQUIRE_DECODING (coding);
6273
6274 if (STRING_MULTIBYTE (str))
6275 {
6276 /* Decoding routines expect the source text to be unibyte. */
6277 str = Fstring_as_unibyte (str);
6278 to_byte = SBYTES (str);
6279 nocopy = 1;
6280 coding->src_multibyte = 0;
6281 }
6282
6283 /* Try to skip the heading and tailing ASCIIs. */
6284 if (require_decoding && coding->type != coding_type_ccl)
6285 {
6286 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6287 0);
6288 if (from == to_byte)
6289 require_decoding = 0;
6290 shrinked_bytes = from + (SBYTES (str) - to_byte);
6291 }
6292
6293 if (!require_decoding
6294 && !(SYMBOLP (coding->post_read_conversion)
6295 && !NILP (Ffboundp (coding->post_read_conversion))))
6296 {
6297 coding->consumed = SBYTES (str);
6298 coding->consumed_char = SCHARS (str);
6299 if (coding->dst_multibyte)
6300 {
6301 str = Fstring_as_multibyte (str);
6302 nocopy = 1;
6303 }
6304 coding->produced = SBYTES (str);
6305 coding->produced_char = SCHARS (str);
6306 return (nocopy ? str : Fcopy_sequence (str));
6307 }
6308
6309 if (coding->composing != COMPOSITION_DISABLED)
6310 coding_allocate_composition_data (coding, from);
6311 len = decoding_buffer_size (coding, to_byte - from);
6312 allocate_conversion_buffer (buf, len);
6313
6314 consumed = consumed_char = produced = produced_char = 0;
6315 while (1)
6316 {
6317 result = decode_coding (coding, SDATA (str) + from + consumed,
6318 buf.data + produced, to_byte - from - consumed,
6319 buf.size - produced);
6320 consumed += coding->consumed;
6321 consumed_char += coding->consumed_char;
6322 produced += coding->produced;
6323 produced_char += coding->produced_char;
6324 if (result == CODING_FINISH_NORMAL
6325 || result == CODING_FINISH_INTERRUPT
6326 || (result == CODING_FINISH_INSUFFICIENT_SRC
6327 && coding->consumed == 0))
6328 break;
6329 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6330 coding_allocate_composition_data (coding, from + produced_char);
6331 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6332 extend_conversion_buffer (&buf);
6333 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6334 {
6335 Lisp_Object eol_type;
6336
6337 /* Recover the original EOL format. */
6338 if (coding->eol_type == CODING_EOL_CR)
6339 {
6340 unsigned char *p;
6341 for (p = buf.data; p < buf.data + produced; p++)
6342 if (*p == '\n') *p = '\r';
6343 }
6344 else if (coding->eol_type == CODING_EOL_CRLF)
6345 {
6346 int num_eol = 0;
6347 unsigned char *p0, *p1;
6348 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6349 if (*p0 == '\n') num_eol++;
6350 if (produced + num_eol >= buf.size)
6351 extend_conversion_buffer (&buf);
6352 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6353 {
6354 *--p1 = *--p0;
6355 if (*p0 == '\n') *--p1 = '\r';
6356 }
6357 produced += num_eol;
6358 produced_char += num_eol;
6359 }
6360 /* Suppress eol-format conversion in the further conversion. */
6361 coding->eol_type = CODING_EOL_LF;
6362
6363 /* Set the coding system symbol to that for Unix-like EOL. */
6364 eol_type = Fget (saved_coding_symbol, Qeol_type);
6365 if (VECTORP (eol_type)
6366 && XVECTOR (eol_type)->size == 3
6367 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6368 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6369 else
6370 coding->symbol = saved_coding_symbol;
6371
6372
6373 }
6374 }
6375
6376 coding->consumed = consumed;
6377 coding->consumed_char = consumed_char;
6378 coding->produced = produced;
6379 coding->produced_char = produced_char;
6380
6381 if (coding->dst_multibyte)
6382 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6383 produced + shrinked_bytes);
6384 else
6385 newstr = make_uninit_string (produced + shrinked_bytes);
6386 if (from > 0)
6387 STRING_COPYIN (newstr, 0, SDATA (str), from);
6388 STRING_COPYIN (newstr, from, buf.data, produced);
6389 if (shrinked_bytes > from)
6390 STRING_COPYIN (newstr, from + produced,
6391 SDATA (str) + to_byte,
6392 shrinked_bytes - from);
6393 free_conversion_buffer (&buf);
6394
6395 coding->consumed += shrinked_bytes;
6396 coding->consumed_char += shrinked_bytes;
6397 coding->produced += shrinked_bytes;
6398 coding->produced_char += shrinked_bytes;
6399
6400 if (coding->cmp_data && coding->cmp_data->used)
6401 coding_restore_composition (coding, newstr);
6402 coding_free_composition_data (coding);
6403
6404 if (SYMBOLP (coding->post_read_conversion)
6405 && !NILP (Ffboundp (coding->post_read_conversion)))
6406 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6407
6408 return newstr;
6409 }
6410
6411 Lisp_Object
6412 encode_coding_string (str, coding, nocopy)
6413 Lisp_Object str;
6414 struct coding_system *coding;
6415 int nocopy;
6416 {
6417 int len;
6418 struct conversion_buffer buf;
6419 int from, to, to_byte;
6420 int result;
6421 int shrinked_bytes = 0;
6422 Lisp_Object newstr;
6423 int consumed, consumed_char, produced, produced_char;
6424
6425 if (SYMBOLP (coding->pre_write_conversion)
6426 && !NILP (Ffboundp (coding->pre_write_conversion)))
6427 {
6428 str = run_pre_post_conversion_on_str (str, coding, 1);
6429 /* As STR is just newly generated, we don't have to copy it
6430 anymore. */
6431 nocopy = 1;
6432 }
6433
6434 from = 0;
6435 to = SCHARS (str);
6436 to_byte = SBYTES (str);
6437
6438 /* Encoding routines determine the multibyteness of the source text
6439 by coding->src_multibyte. */
6440 coding->src_multibyte = SCHARS (str) < SBYTES (str);
6441 coding->dst_multibyte = 0;
6442 if (! CODING_REQUIRE_ENCODING (coding))
6443 goto no_need_of_encoding;
6444
6445 if (coding->composing != COMPOSITION_DISABLED)
6446 coding_save_composition (coding, from, to, str);
6447
6448 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6449 if we must run CCL program or there are compositions to
6450 encode. */
6451 if (coding->type != coding_type_ccl
6452 && (! coding->cmp_data || coding->cmp_data->used == 0))
6453 {
6454 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6455 1);
6456 if (from == to_byte)
6457 {
6458 coding_free_composition_data (coding);
6459 goto no_need_of_encoding;
6460 }
6461 shrinked_bytes = from + (SBYTES (str) - to_byte);
6462 }
6463
6464 len = encoding_buffer_size (coding, to_byte - from);
6465 allocate_conversion_buffer (buf, len);
6466
6467 consumed = consumed_char = produced = produced_char = 0;
6468 while (1)
6469 {
6470 result = encode_coding (coding, SDATA (str) + from + consumed,
6471 buf.data + produced, to_byte - from - consumed,
6472 buf.size - produced);
6473 consumed += coding->consumed;
6474 consumed_char += coding->consumed_char;
6475 produced += coding->produced;
6476 produced_char += coding->produced_char;
6477 if (result == CODING_FINISH_NORMAL
6478 || result == CODING_FINISH_INTERRUPT
6479 || (result == CODING_FINISH_INSUFFICIENT_SRC
6480 && coding->consumed == 0))
6481 break;
6482 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6483 extend_conversion_buffer (&buf);
6484 }
6485
6486 coding->consumed = consumed;
6487 coding->consumed_char = consumed_char;
6488 coding->produced = produced;
6489 coding->produced_char = produced_char;
6490
6491 newstr = make_uninit_string (produced + shrinked_bytes);
6492 if (from > 0)
6493 STRING_COPYIN (newstr, 0, SDATA (str), from);
6494 STRING_COPYIN (newstr, from, buf.data, produced);
6495 if (shrinked_bytes > from)
6496 STRING_COPYIN (newstr, from + produced,
6497 SDATA (str) + to_byte,
6498 shrinked_bytes - from);
6499
6500 free_conversion_buffer (&buf);
6501 coding_free_composition_data (coding);
6502
6503 return newstr;
6504
6505 no_need_of_encoding:
6506 coding->consumed = SBYTES (str);
6507 coding->consumed_char = SCHARS (str);
6508 if (STRING_MULTIBYTE (str))
6509 {
6510 if (nocopy)
6511 /* We are sure that STR doesn't contain a multibyte
6512 character. */
6513 STRING_SET_UNIBYTE (str);
6514 else
6515 {
6516 str = Fstring_as_unibyte (str);
6517 nocopy = 1;
6518 }
6519 }
6520 coding->produced = SBYTES (str);
6521 coding->produced_char = SCHARS (str);
6522 return (nocopy ? str : Fcopy_sequence (str));
6523 }
6524
6525 \f
6526 #ifdef emacs
6527 /*** 8. Emacs Lisp library functions ***/
6528
6529 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6530 doc: /* Return t if OBJECT is nil or a coding-system.
6531 See the documentation of `make-coding-system' for information
6532 about coding-system objects. */)
6533 (obj)
6534 Lisp_Object obj;
6535 {
6536 if (NILP (obj))
6537 return Qt;
6538 if (!SYMBOLP (obj))
6539 return Qnil;
6540 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6541 return Qt;
6542 /* Get coding-spec vector for OBJ. */
6543 obj = Fget (obj, Qcoding_system);
6544 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6545 ? Qt : Qnil);
6546 }
6547
6548 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6549 Sread_non_nil_coding_system, 1, 1, 0,
6550 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6551 (prompt)
6552 Lisp_Object prompt;
6553 {
6554 Lisp_Object val;
6555 do
6556 {
6557 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6558 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6559 }
6560 while (SCHARS (val) == 0);
6561 return (Fintern (val, Qnil));
6562 }
6563
6564 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6565 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6566 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6567 (prompt, default_coding_system)
6568 Lisp_Object prompt, default_coding_system;
6569 {
6570 Lisp_Object val;
6571 if (SYMBOLP (default_coding_system))
6572 default_coding_system = SYMBOL_NAME (default_coding_system);
6573 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6574 Qt, Qnil, Qcoding_system_history,
6575 default_coding_system, Qnil);
6576 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6577 }
6578
6579 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6580 1, 1, 0,
6581 doc: /* Check validity of CODING-SYSTEM.
6582 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6583 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6584 The value of this property should be a vector of length 5. */)
6585 (coding_system)
6586 Lisp_Object coding_system;
6587 {
6588 Lisp_Object define_form;
6589
6590 define_form = Fget (coding_system, Qcoding_system_define_form);
6591 if (! NILP (define_form))
6592 {
6593 Fput (coding_system, Qcoding_system_define_form, Qnil);
6594 safe_eval (define_form);
6595 }
6596 if (!NILP (Fcoding_system_p (coding_system)))
6597 return coding_system;
6598 while (1)
6599 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6600 }
6601 \f
6602 Lisp_Object
6603 detect_coding_system (src, src_bytes, highest, multibytep)
6604 const unsigned char *src;
6605 int src_bytes, highest;
6606 int multibytep;
6607 {
6608 int coding_mask, eol_type;
6609 Lisp_Object val, tmp;
6610 int dummy;
6611
6612 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6613 eol_type = detect_eol_type (src, src_bytes, &dummy);
6614 if (eol_type == CODING_EOL_INCONSISTENT)
6615 eol_type = CODING_EOL_UNDECIDED;
6616
6617 if (!coding_mask)
6618 {
6619 val = Qundecided;
6620 if (eol_type != CODING_EOL_UNDECIDED)
6621 {
6622 Lisp_Object val2;
6623 val2 = Fget (Qundecided, Qeol_type);
6624 if (VECTORP (val2))
6625 val = XVECTOR (val2)->contents[eol_type];
6626 }
6627 return (highest ? val : Fcons (val, Qnil));
6628 }
6629
6630 /* At first, gather possible coding systems in VAL. */
6631 val = Qnil;
6632 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6633 {
6634 Lisp_Object category_val, category_index;
6635
6636 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6637 category_val = Fsymbol_value (XCAR (tmp));
6638 if (!NILP (category_val)
6639 && NATNUMP (category_index)
6640 && (coding_mask & (1 << XFASTINT (category_index))))
6641 {
6642 val = Fcons (category_val, val);
6643 if (highest)
6644 break;
6645 }
6646 }
6647 if (!highest)
6648 val = Fnreverse (val);
6649
6650 /* Then, replace the elements with subsidiary coding systems. */
6651 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6652 {
6653 if (eol_type != CODING_EOL_UNDECIDED
6654 && eol_type != CODING_EOL_INCONSISTENT)
6655 {
6656 Lisp_Object eol;
6657 eol = Fget (XCAR (tmp), Qeol_type);
6658 if (VECTORP (eol))
6659 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6660 }
6661 }
6662 return (highest ? XCAR (val) : val);
6663 }
6664
6665 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6666 2, 3, 0,
6667 doc: /* Detect how the byte sequence in the region is encoded.
6668 Return a list of possible coding systems used on decoding a byte
6669 sequence containing the bytes in the region between START and END when
6670 the coding system `undecided' is specified. The list is ordered by
6671 priority decided in the current language environment.
6672
6673 If only ASCII characters are found, it returns a list of single element
6674 `undecided' or its subsidiary coding system according to a detected
6675 end-of-line format.
6676
6677 If optional argument HIGHEST is non-nil, return the coding system of
6678 highest priority. */)
6679 (start, end, highest)
6680 Lisp_Object start, end, highest;
6681 {
6682 int from, to;
6683 int from_byte, to_byte;
6684 int include_anchor_byte = 0;
6685
6686 CHECK_NUMBER_COERCE_MARKER (start);
6687 CHECK_NUMBER_COERCE_MARKER (end);
6688
6689 validate_region (&start, &end);
6690 from = XINT (start), to = XINT (end);
6691 from_byte = CHAR_TO_BYTE (from);
6692 to_byte = CHAR_TO_BYTE (to);
6693
6694 if (from < GPT && to >= GPT)
6695 move_gap_both (to, to_byte);
6696 /* If we an anchor byte `\0' follows the region, we include it in
6697 the detecting source. Then code detectors can handle the tailing
6698 byte sequence more accurately.
6699
6700 Fix me: This is not a perfect solution. It is better that we
6701 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6702 */
6703 if (to == Z || (to == GPT && GAP_SIZE > 0))
6704 include_anchor_byte = 1;
6705 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6706 to_byte - from_byte + include_anchor_byte,
6707 !NILP (highest),
6708 !NILP (current_buffer
6709 ->enable_multibyte_characters));
6710 }
6711
6712 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6713 1, 2, 0,
6714 doc: /* Detect how the byte sequence in STRING is encoded.
6715 Return a list of possible coding systems used on decoding a byte
6716 sequence containing the bytes in STRING when the coding system
6717 `undecided' is specified. The list is ordered by priority decided in
6718 the current language environment.
6719
6720 If only ASCII characters are found, it returns a list of single element
6721 `undecided' or its subsidiary coding system according to a detected
6722 end-of-line format.
6723
6724 If optional argument HIGHEST is non-nil, return the coding system of
6725 highest priority. */)
6726 (string, highest)
6727 Lisp_Object string, highest;
6728 {
6729 CHECK_STRING (string);
6730
6731 return detect_coding_system (SDATA (string),
6732 /* "+ 1" is to include the anchor byte
6733 `\0'. With this, code detectors can
6734 handle the tailing bytes more
6735 accurately. */
6736 SBYTES (string) + 1,
6737 !NILP (highest),
6738 STRING_MULTIBYTE (string));
6739 }
6740
6741 /* Subroutine for Ffind_coding_systems_region_internal.
6742
6743 Return a list of coding systems that safely encode the multibyte
6744 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6745 possible coding systems. If it is nil, it means that we have not
6746 yet found any coding systems.
6747
6748 WORK_TABLE a char-table of which element is set to t once the
6749 element is looked up.
6750
6751 If a non-ASCII single byte char is found, set
6752 *single_byte_char_found to 1. */
6753
6754 static Lisp_Object
6755 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6756 unsigned char *p, *pend;
6757 Lisp_Object safe_codings, work_table;
6758 int *single_byte_char_found;
6759 {
6760 int c, len;
6761 Lisp_Object val, ch;
6762 Lisp_Object prev, tail;
6763
6764 if (NILP (safe_codings))
6765 goto done_safe_codings;
6766 while (p < pend)
6767 {
6768 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6769 p += len;
6770 if (ASCII_BYTE_P (c))
6771 /* We can ignore ASCII characters here. */
6772 continue;
6773 if (SINGLE_BYTE_CHAR_P (c))
6774 *single_byte_char_found = 1;
6775 /* Check the safe coding systems for C. */
6776 ch = make_number (c);
6777 val = Faref (work_table, ch);
6778 if (EQ (val, Qt))
6779 /* This element was already checked. Ignore it. */
6780 continue;
6781 /* Remember that we checked this element. */
6782 Faset (work_table, ch, Qt);
6783
6784 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6785 {
6786 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6787 int encodable;
6788
6789 elt = XCAR (tail);
6790 if (CONSP (XCDR (elt)))
6791 {
6792 /* This entry has this format now:
6793 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6794 ACCEPT-LATIN-EXTRA ) */
6795 val = XCDR (elt);
6796 encodable = ! NILP (Faref (XCAR (val), ch));
6797 if (! encodable)
6798 {
6799 val = XCDR (val);
6800 translation_table = XCAR (val);
6801 hash_table = XCAR (XCDR (val));
6802 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6803 }
6804 }
6805 else
6806 {
6807 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6808 encodable = ! NILP (Faref (XCDR (elt), ch));
6809 if (! encodable)
6810 {
6811 /* Transform the format to:
6812 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6813 ACCEPT-LATIN-EXTRA ) */
6814 val = Fget (XCAR (elt), Qcoding_system);
6815 translation_table
6816 = Fplist_get (AREF (val, 3),
6817 Qtranslation_table_for_encode);
6818 if (SYMBOLP (translation_table))
6819 translation_table = Fget (translation_table,
6820 Qtranslation_table);
6821 hash_table
6822 = (CHAR_TABLE_P (translation_table)
6823 ? XCHAR_TABLE (translation_table)->extras[1]
6824 : Qnil);
6825 accept_latin_extra
6826 = ((EQ (AREF (val, 0), make_number (2))
6827 && VECTORP (AREF (val, 4)))
6828 ? AREF (AREF (val, 4), 16)
6829 : Qnil);
6830 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6831 translation_table, hash_table,
6832 accept_latin_extra));
6833 }
6834 }
6835
6836 if (! encodable
6837 && ((CHAR_TABLE_P (translation_table)
6838 && ! NILP (Faref (translation_table, ch)))
6839 || (HASH_TABLE_P (hash_table)
6840 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6841 || (SINGLE_BYTE_CHAR_P (c)
6842 && ! NILP (accept_latin_extra)
6843 && VECTORP (Vlatin_extra_code_table)
6844 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6845 encodable = 1;
6846 if (encodable)
6847 prev = tail;
6848 else
6849 {
6850 /* Exclude this coding system from SAFE_CODINGS. */
6851 if (EQ (tail, safe_codings))
6852 {
6853 safe_codings = XCDR (safe_codings);
6854 if (NILP (safe_codings))
6855 goto done_safe_codings;
6856 }
6857 else
6858 XSETCDR (prev, XCDR (tail));
6859 }
6860 }
6861 }
6862
6863 done_safe_codings:
6864 /* If the above loop was terminated before P reaches PEND, it means
6865 SAFE_CODINGS was set to nil. If we have not yet found an
6866 non-ASCII single-byte char, check it now. */
6867 if (! *single_byte_char_found)
6868 while (p < pend)
6869 {
6870 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6871 p += len;
6872 if (! ASCII_BYTE_P (c)
6873 && SINGLE_BYTE_CHAR_P (c))
6874 {
6875 *single_byte_char_found = 1;
6876 break;
6877 }
6878 }
6879 return safe_codings;
6880 }
6881
6882 DEFUN ("find-coding-systems-region-internal",
6883 Ffind_coding_systems_region_internal,
6884 Sfind_coding_systems_region_internal, 2, 2, 0,
6885 doc: /* Internal use only. */)
6886 (start, end)
6887 Lisp_Object start, end;
6888 {
6889 Lisp_Object work_table, safe_codings;
6890 int non_ascii_p = 0;
6891 int single_byte_char_found = 0;
6892 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6893
6894 if (STRINGP (start))
6895 {
6896 if (!STRING_MULTIBYTE (start))
6897 return Qt;
6898 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6899 p2 = p2end = p1end;
6900 if (SCHARS (start) != SBYTES (start))
6901 non_ascii_p = 1;
6902 }
6903 else
6904 {
6905 int from, to, stop;
6906
6907 CHECK_NUMBER_COERCE_MARKER (start);
6908 CHECK_NUMBER_COERCE_MARKER (end);
6909 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6910 args_out_of_range (start, end);
6911 if (NILP (current_buffer->enable_multibyte_characters))
6912 return Qt;
6913 from = CHAR_TO_BYTE (XINT (start));
6914 to = CHAR_TO_BYTE (XINT (end));
6915 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6916 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6917 if (stop == to)
6918 p2 = p2end = p1end;
6919 else
6920 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6921 if (XINT (end) - XINT (start) != to - from)
6922 non_ascii_p = 1;
6923 }
6924
6925 if (!non_ascii_p)
6926 {
6927 /* We are sure that the text contains no multibyte character.
6928 Check if it contains eight-bit-graphic. */
6929 p = p1;
6930 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6931 if (p == p1end)
6932 {
6933 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6934 if (p == p2end)
6935 return Qt;
6936 }
6937 }
6938
6939 /* The text contains non-ASCII characters. */
6940
6941 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6942 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6943
6944 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6945 &single_byte_char_found);
6946 if (p2 < p2end)
6947 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6948 &single_byte_char_found);
6949 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6950 safe_codings = Qt;
6951 else
6952 {
6953 /* Turn safe_codings to a list of coding systems... */
6954 Lisp_Object val;
6955
6956 if (single_byte_char_found)
6957 /* ... and append these for eight-bit chars. */
6958 val = Fcons (Qraw_text,
6959 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6960 else
6961 /* ... and append generic coding systems. */
6962 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6963
6964 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6965 val = Fcons (XCAR (XCAR (safe_codings)), val);
6966 safe_codings = val;
6967 }
6968
6969 return safe_codings;
6970 }
6971
6972
6973 /* Search from position POS for such characters that are unencodable
6974 accoding to SAFE_CHARS, and return a list of their positions. P
6975 points where in the memory the character at POS exists. Limit the
6976 search at PEND or when Nth unencodable characters are found.
6977
6978 If SAFE_CHARS is a char table, an element for an unencodable
6979 character is nil.
6980
6981 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6982
6983 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6984 eight-bit-graphic characters are unencodable. */
6985
6986 static Lisp_Object
6987 unencodable_char_position (safe_chars, pos, p, pend, n)
6988 Lisp_Object safe_chars;
6989 int pos;
6990 unsigned char *p, *pend;
6991 int n;
6992 {
6993 Lisp_Object pos_list;
6994
6995 pos_list = Qnil;
6996 while (p < pend)
6997 {
6998 int len;
6999 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7000
7001 if (c >= 128
7002 && (CHAR_TABLE_P (safe_chars)
7003 ? NILP (CHAR_TABLE_REF (safe_chars, c))
7004 : (NILP (safe_chars) || c < 256)))
7005 {
7006 pos_list = Fcons (make_number (pos), pos_list);
7007 if (--n <= 0)
7008 break;
7009 }
7010 pos++;
7011 p += len;
7012 }
7013 return Fnreverse (pos_list);
7014 }
7015
7016
7017 DEFUN ("unencodable-char-position", Funencodable_char_position,
7018 Sunencodable_char_position, 3, 5, 0,
7019 doc: /*
7020 Return position of first un-encodable character in a region.
7021 START and END specfiy the region and CODING-SYSTEM specifies the
7022 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7023
7024 If optional 4th argument COUNT is non-nil, it specifies at most how
7025 many un-encodable characters to search. In this case, the value is a
7026 list of positions.
7027
7028 If optional 5th argument STRING is non-nil, it is a string to search
7029 for un-encodable characters. In that case, START and END are indexes
7030 to the string. */)
7031 (start, end, coding_system, count, string)
7032 Lisp_Object start, end, coding_system, count, string;
7033 {
7034 int n;
7035 Lisp_Object safe_chars;
7036 struct coding_system coding;
7037 Lisp_Object positions;
7038 int from, to;
7039 unsigned char *p, *pend;
7040
7041 if (NILP (string))
7042 {
7043 validate_region (&start, &end);
7044 from = XINT (start);
7045 to = XINT (end);
7046 if (NILP (current_buffer->enable_multibyte_characters))
7047 return Qnil;
7048 p = CHAR_POS_ADDR (from);
7049 if (to == GPT)
7050 pend = GPT_ADDR;
7051 else
7052 pend = CHAR_POS_ADDR (to);
7053 }
7054 else
7055 {
7056 CHECK_STRING (string);
7057 CHECK_NATNUM (start);
7058 CHECK_NATNUM (end);
7059 from = XINT (start);
7060 to = XINT (end);
7061 if (from > to
7062 || to > SCHARS (string))
7063 args_out_of_range_3 (string, start, end);
7064 if (! STRING_MULTIBYTE (string))
7065 return Qnil;
7066 p = SDATA (string) + string_char_to_byte (string, from);
7067 pend = SDATA (string) + string_char_to_byte (string, to);
7068 }
7069
7070 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7071
7072 if (NILP (count))
7073 n = 1;
7074 else
7075 {
7076 CHECK_NATNUM (count);
7077 n = XINT (count);
7078 }
7079
7080 if (coding.type == coding_type_no_conversion
7081 || coding.type == coding_type_raw_text)
7082 return Qnil;
7083
7084 if (coding.type == coding_type_undecided)
7085 safe_chars = Qnil;
7086 else
7087 safe_chars = coding_safe_chars (coding_system);
7088
7089 if (STRINGP (string)
7090 || from >= GPT || to <= GPT)
7091 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7092 else
7093 {
7094 Lisp_Object args[2];
7095
7096 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7097 n -= XINT (Flength (args[0]));
7098 if (n <= 0)
7099 positions = args[0];
7100 else
7101 {
7102 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7103 pend, n);
7104 positions = Fappend (2, args);
7105 }
7106 }
7107
7108 return (NILP (count) ? Fcar (positions) : positions);
7109 }
7110
7111
7112 Lisp_Object
7113 code_convert_region1 (start, end, coding_system, encodep)
7114 Lisp_Object start, end, coding_system;
7115 int encodep;
7116 {
7117 struct coding_system coding;
7118 int from, to;
7119
7120 CHECK_NUMBER_COERCE_MARKER (start);
7121 CHECK_NUMBER_COERCE_MARKER (end);
7122 CHECK_SYMBOL (coding_system);
7123
7124 validate_region (&start, &end);
7125 from = XFASTINT (start);
7126 to = XFASTINT (end);
7127
7128 if (NILP (coding_system))
7129 return make_number (to - from);
7130
7131 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7132 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7133
7134 coding.mode |= CODING_MODE_LAST_BLOCK;
7135 coding.src_multibyte = coding.dst_multibyte
7136 = !NILP (current_buffer->enable_multibyte_characters);
7137 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7138 &coding, encodep, 1);
7139 Vlast_coding_system_used = coding.symbol;
7140 return make_number (coding.produced_char);
7141 }
7142
7143 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7144 3, 3, "r\nzCoding system: ",
7145 doc: /* Decode the current region from the specified coding system.
7146 When called from a program, takes three arguments:
7147 START, END, and CODING-SYSTEM. START and END are buffer positions.
7148 This function sets `last-coding-system-used' to the precise coding system
7149 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7150 not fully specified.)
7151 It returns the length of the decoded text. */)
7152 (start, end, coding_system)
7153 Lisp_Object start, end, coding_system;
7154 {
7155 return code_convert_region1 (start, end, coding_system, 0);
7156 }
7157
7158 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7159 3, 3, "r\nzCoding system: ",
7160 doc: /* Encode the current region into the specified coding system.
7161 When called from a program, takes three arguments:
7162 START, END, and CODING-SYSTEM. START and END are buffer positions.
7163 This function sets `last-coding-system-used' to the precise coding system
7164 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7165 not fully specified.)
7166 It returns the length of the encoded text. */)
7167 (start, end, coding_system)
7168 Lisp_Object start, end, coding_system;
7169 {
7170 return code_convert_region1 (start, end, coding_system, 1);
7171 }
7172
7173 Lisp_Object
7174 code_convert_string1 (string, coding_system, nocopy, encodep)
7175 Lisp_Object string, coding_system, nocopy;
7176 int encodep;
7177 {
7178 struct coding_system coding;
7179
7180 CHECK_STRING (string);
7181 CHECK_SYMBOL (coding_system);
7182
7183 if (NILP (coding_system))
7184 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7185
7186 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7187 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7188
7189 coding.mode |= CODING_MODE_LAST_BLOCK;
7190 string = (encodep
7191 ? encode_coding_string (string, &coding, !NILP (nocopy))
7192 : decode_coding_string (string, &coding, !NILP (nocopy)));
7193 Vlast_coding_system_used = coding.symbol;
7194
7195 return string;
7196 }
7197
7198 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7199 2, 3, 0,
7200 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7201 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7202 if the decoding operation is trivial.
7203 This function sets `last-coding-system-used' to the precise coding system
7204 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7205 not fully specified.) */)
7206 (string, coding_system, nocopy)
7207 Lisp_Object string, coding_system, nocopy;
7208 {
7209 return code_convert_string1 (string, coding_system, nocopy, 0);
7210 }
7211
7212 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7213 2, 3, 0,
7214 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7215 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7216 if the encoding operation is trivial.
7217 This function sets `last-coding-system-used' to the precise coding system
7218 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7219 not fully specified.) */)
7220 (string, coding_system, nocopy)
7221 Lisp_Object string, coding_system, nocopy;
7222 {
7223 return code_convert_string1 (string, coding_system, nocopy, 1);
7224 }
7225
7226 /* Encode or decode STRING according to CODING_SYSTEM.
7227 Do not set Vlast_coding_system_used.
7228
7229 This function is called only from macros DECODE_FILE and
7230 ENCODE_FILE, thus we ignore character composition. */
7231
7232 Lisp_Object
7233 code_convert_string_norecord (string, coding_system, encodep)
7234 Lisp_Object string, coding_system;
7235 int encodep;
7236 {
7237 struct coding_system coding;
7238
7239 CHECK_STRING (string);
7240 CHECK_SYMBOL (coding_system);
7241
7242 if (NILP (coding_system))
7243 return string;
7244
7245 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7246 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7247
7248 coding.composing = COMPOSITION_DISABLED;
7249 coding.mode |= CODING_MODE_LAST_BLOCK;
7250 return (encodep
7251 ? encode_coding_string (string, &coding, 1)
7252 : decode_coding_string (string, &coding, 1));
7253 }
7254 \f
7255 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7256 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7257 Return the corresponding character. */)
7258 (code)
7259 Lisp_Object code;
7260 {
7261 unsigned char c1, c2, s1, s2;
7262 Lisp_Object val;
7263
7264 CHECK_NUMBER (code);
7265 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7266 if (s1 == 0)
7267 {
7268 if (s2 < 0x80)
7269 XSETFASTINT (val, s2);
7270 else if (s2 >= 0xA0 || s2 <= 0xDF)
7271 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7272 else
7273 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7274 }
7275 else
7276 {
7277 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7278 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7279 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7280 DECODE_SJIS (s1, s2, c1, c2);
7281 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7282 }
7283 return val;
7284 }
7285
7286 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7287 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7288 Return the corresponding code in SJIS. */)
7289 (ch)
7290 Lisp_Object ch;
7291 {
7292 int charset, c1, c2, s1, s2;
7293 Lisp_Object val;
7294
7295 CHECK_NUMBER (ch);
7296 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7297 if (charset == CHARSET_ASCII)
7298 {
7299 val = ch;
7300 }
7301 else if (charset == charset_jisx0208
7302 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7303 {
7304 ENCODE_SJIS (c1, c2, s1, s2);
7305 XSETFASTINT (val, (s1 << 8) | s2);
7306 }
7307 else if (charset == charset_katakana_jisx0201
7308 && c1 > 0x20 && c2 < 0xE0)
7309 {
7310 XSETFASTINT (val, c1 | 0x80);
7311 }
7312 else
7313 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7314 return val;
7315 }
7316
7317 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7318 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7319 Return the corresponding character. */)
7320 (code)
7321 Lisp_Object code;
7322 {
7323 int charset;
7324 unsigned char b1, b2, c1, c2;
7325 Lisp_Object val;
7326
7327 CHECK_NUMBER (code);
7328 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7329 if (b1 == 0)
7330 {
7331 if (b2 >= 0x80)
7332 error ("Invalid BIG5 code: %x", XFASTINT (code));
7333 val = code;
7334 }
7335 else
7336 {
7337 if ((b1 < 0xA1 || b1 > 0xFE)
7338 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7339 error ("Invalid BIG5 code: %x", XFASTINT (code));
7340 DECODE_BIG5 (b1, b2, charset, c1, c2);
7341 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7342 }
7343 return val;
7344 }
7345
7346 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7347 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7348 Return the corresponding character code in Big5. */)
7349 (ch)
7350 Lisp_Object ch;
7351 {
7352 int charset, c1, c2, b1, b2;
7353 Lisp_Object val;
7354
7355 CHECK_NUMBER (ch);
7356 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7357 if (charset == CHARSET_ASCII)
7358 {
7359 val = ch;
7360 }
7361 else if ((charset == charset_big5_1
7362 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7363 || (charset == charset_big5_2
7364 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7365 {
7366 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7367 XSETFASTINT (val, (b1 << 8) | b2);
7368 }
7369 else
7370 error ("Can't encode to Big5: %d", XFASTINT (ch));
7371 return val;
7372 }
7373 \f
7374 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7375 Sset_terminal_coding_system_internal, 1, 1, 0,
7376 doc: /* Internal use only. */)
7377 (coding_system)
7378 Lisp_Object coding_system;
7379 {
7380 CHECK_SYMBOL (coding_system);
7381 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7382 /* We had better not send unsafe characters to terminal. */
7383 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7384 /* Character composition should be disabled. */
7385 terminal_coding.composing = COMPOSITION_DISABLED;
7386 /* Error notification should be suppressed. */
7387 terminal_coding.suppress_error = 1;
7388 terminal_coding.src_multibyte = 1;
7389 terminal_coding.dst_multibyte = 0;
7390 return Qnil;
7391 }
7392
7393 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7394 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7395 doc: /* Internal use only. */)
7396 (coding_system)
7397 Lisp_Object coding_system;
7398 {
7399 CHECK_SYMBOL (coding_system);
7400 setup_coding_system (Fcheck_coding_system (coding_system),
7401 &safe_terminal_coding);
7402 /* Character composition should be disabled. */
7403 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7404 /* Error notification should be suppressed. */
7405 safe_terminal_coding.suppress_error = 1;
7406 safe_terminal_coding.src_multibyte = 1;
7407 safe_terminal_coding.dst_multibyte = 0;
7408 return Qnil;
7409 }
7410
7411 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7412 Sterminal_coding_system, 0, 0, 0,
7413 doc: /* Return coding system specified for terminal output. */)
7414 ()
7415 {
7416 return terminal_coding.symbol;
7417 }
7418
7419 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7420 Sset_keyboard_coding_system_internal, 1, 1, 0,
7421 doc: /* Internal use only. */)
7422 (coding_system)
7423 Lisp_Object coding_system;
7424 {
7425 CHECK_SYMBOL (coding_system);
7426 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7427 /* Character composition should be disabled. */
7428 keyboard_coding.composing = COMPOSITION_DISABLED;
7429 return Qnil;
7430 }
7431
7432 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7433 Skeyboard_coding_system, 0, 0, 0,
7434 doc: /* Return coding system specified for decoding keyboard input. */)
7435 ()
7436 {
7437 return keyboard_coding.symbol;
7438 }
7439
7440 \f
7441 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7442 Sfind_operation_coding_system, 1, MANY, 0,
7443 doc: /* Choose a coding system for an operation based on the target name.
7444 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7445 DECODING-SYSTEM is the coding system to use for decoding
7446 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7447 for encoding (in case OPERATION does encoding).
7448
7449 The first argument OPERATION specifies an I/O primitive:
7450 For file I/O, `insert-file-contents' or `write-region'.
7451 For process I/O, `call-process', `call-process-region', or `start-process'.
7452 For network I/O, `open-network-stream'.
7453
7454 The remaining arguments should be the same arguments that were passed
7455 to the primitive. Depending on which primitive, one of those arguments
7456 is selected as the TARGET. For example, if OPERATION does file I/O,
7457 whichever argument specifies the file name is TARGET.
7458
7459 TARGET has a meaning which depends on OPERATION:
7460 For file I/O, TARGET is a file name (except for the special case below).
7461 For process I/O, TARGET is a process name.
7462 For network I/O, TARGET is a service name or a port number
7463
7464 This function looks up what specified for TARGET in,
7465 `file-coding-system-alist', `process-coding-system-alist',
7466 or `network-coding-system-alist' depending on OPERATION.
7467 They may specify a coding system, a cons of coding systems,
7468 or a function symbol to call.
7469 In the last case, we call the function with one argument,
7470 which is a list of all the arguments given to this function.
7471
7472 If OPERATION is `insert-file-contents', the argument corresponding to
7473 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
7474 file name to look up, and BUFFER is a buffer that already contains the
7475 file (but not yet decoded). If a function is found as above, the
7476 function must pay attention to this format of TARGET.
7477
7478 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7479 (nargs, args)
7480 int nargs;
7481 Lisp_Object *args;
7482 {
7483 Lisp_Object operation, target_idx, target, val;
7484 register Lisp_Object chain;
7485
7486 if (nargs < 2)
7487 error ("Too few arguments");
7488 operation = args[0];
7489 if (!SYMBOLP (operation)
7490 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7491 error ("Invalid first argument");
7492 if (nargs < 1 + XINT (target_idx))
7493 error ("Too few arguments for operation: %s",
7494 SDATA (SYMBOL_NAME (operation)));
7495 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7496 argument to write-region) is string, it must be treated as a
7497 target file name. */
7498 if (EQ (operation, Qwrite_region)
7499 && nargs > 5
7500 && STRINGP (args[5]))
7501 target_idx = make_number (4);
7502 target = args[XINT (target_idx) + 1];
7503 if (!(STRINGP (target)
7504 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7505 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7506 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7507 error ("Invalid argument %d", XINT (target_idx) + 1);
7508 if (CONSP (target))
7509 target = XCAR (target);
7510
7511 chain = ((EQ (operation, Qinsert_file_contents)
7512 || EQ (operation, Qwrite_region))
7513 ? Vfile_coding_system_alist
7514 : (EQ (operation, Qopen_network_stream)
7515 ? Vnetwork_coding_system_alist
7516 : Vprocess_coding_system_alist));
7517 if (NILP (chain))
7518 return Qnil;
7519
7520 for (; CONSP (chain); chain = XCDR (chain))
7521 {
7522 Lisp_Object elt;
7523 elt = XCAR (chain);
7524
7525 if (CONSP (elt)
7526 && ((STRINGP (target)
7527 && STRINGP (XCAR (elt))
7528 && fast_string_match (XCAR (elt), target) >= 0)
7529 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7530 {
7531 val = XCDR (elt);
7532 /* Here, if VAL is both a valid coding system and a valid
7533 function symbol, we return VAL as a coding system. */
7534 if (CONSP (val))
7535 return val;
7536 if (! SYMBOLP (val))
7537 return Qnil;
7538 if (! NILP (Fcoding_system_p (val)))
7539 return Fcons (val, val);
7540 if (! NILP (Ffboundp (val)))
7541 {
7542 val = safe_call1 (val, Flist (nargs, args));
7543 if (CONSP (val))
7544 return val;
7545 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7546 return Fcons (val, val);
7547 }
7548 return Qnil;
7549 }
7550 }
7551 return Qnil;
7552 }
7553
7554 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7555 Supdate_coding_systems_internal, 0, 0, 0,
7556 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7557 When values of any coding categories are changed, you must
7558 call this function. */)
7559 ()
7560 {
7561 int i;
7562
7563 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7564 {
7565 Lisp_Object val;
7566
7567 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7568 if (!NILP (val))
7569 {
7570 if (! coding_system_table[i])
7571 coding_system_table[i] = ((struct coding_system *)
7572 xmalloc (sizeof (struct coding_system)));
7573 setup_coding_system (val, coding_system_table[i]);
7574 }
7575 else if (coding_system_table[i])
7576 {
7577 xfree (coding_system_table[i]);
7578 coding_system_table[i] = NULL;
7579 }
7580 }
7581
7582 return Qnil;
7583 }
7584
7585 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7586 Sset_coding_priority_internal, 0, 0, 0,
7587 doc: /* Update internal database for the current value of `coding-category-list'.
7588 This function is internal use only. */)
7589 ()
7590 {
7591 int i = 0, idx;
7592 Lisp_Object val;
7593
7594 val = Vcoding_category_list;
7595
7596 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7597 {
7598 if (! SYMBOLP (XCAR (val)))
7599 break;
7600 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7601 if (idx >= CODING_CATEGORY_IDX_MAX)
7602 break;
7603 coding_priorities[i++] = (1 << idx);
7604 val = XCDR (val);
7605 }
7606 /* If coding-category-list is valid and contains all coding
7607 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7608 the following code saves Emacs from crashing. */
7609 while (i < CODING_CATEGORY_IDX_MAX)
7610 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7611
7612 return Qnil;
7613 }
7614
7615 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7616 Sdefine_coding_system_internal, 1, 1, 0,
7617 doc: /* Register CODING-SYSTEM as a base coding system.
7618 This function is internal use only. */)
7619 (coding_system)
7620 Lisp_Object coding_system;
7621 {
7622 Lisp_Object safe_chars, slot;
7623
7624 if (NILP (Fcheck_coding_system (coding_system)))
7625 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7626 safe_chars = coding_safe_chars (coding_system);
7627 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7628 error ("No valid safe-chars property for %s",
7629 SDATA (SYMBOL_NAME (coding_system)));
7630 if (EQ (safe_chars, Qt))
7631 {
7632 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7633 XSETCAR (Vcoding_system_safe_chars,
7634 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7635 }
7636 else
7637 {
7638 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7639 if (NILP (slot))
7640 XSETCDR (Vcoding_system_safe_chars,
7641 nconc2 (XCDR (Vcoding_system_safe_chars),
7642 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7643 else
7644 XSETCDR (slot, safe_chars);
7645 }
7646 return Qnil;
7647 }
7648
7649 #endif /* emacs */
7650
7651 \f
7652 /*** 9. Post-amble ***/
7653
7654 void
7655 init_coding_once ()
7656 {
7657 int i;
7658
7659 /* Emacs' internal format specific initialize routine. */
7660 for (i = 0; i <= 0x20; i++)
7661 emacs_code_class[i] = EMACS_control_code;
7662 emacs_code_class[0x0A] = EMACS_linefeed_code;
7663 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7664 for (i = 0x21 ; i < 0x7F; i++)
7665 emacs_code_class[i] = EMACS_ascii_code;
7666 emacs_code_class[0x7F] = EMACS_control_code;
7667 for (i = 0x80; i < 0xFF; i++)
7668 emacs_code_class[i] = EMACS_invalid_code;
7669 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7670 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7671 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7672 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7673
7674 /* ISO2022 specific initialize routine. */
7675 for (i = 0; i < 0x20; i++)
7676 iso_code_class[i] = ISO_control_0;
7677 for (i = 0x21; i < 0x7F; i++)
7678 iso_code_class[i] = ISO_graphic_plane_0;
7679 for (i = 0x80; i < 0xA0; i++)
7680 iso_code_class[i] = ISO_control_1;
7681 for (i = 0xA1; i < 0xFF; i++)
7682 iso_code_class[i] = ISO_graphic_plane_1;
7683 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7684 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7685 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7686 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7687 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7688 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7689 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7690 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7691 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7692 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7693
7694 setup_coding_system (Qnil, &keyboard_coding);
7695 setup_coding_system (Qnil, &terminal_coding);
7696 setup_coding_system (Qnil, &safe_terminal_coding);
7697 setup_coding_system (Qnil, &default_buffer_file_coding);
7698
7699 bzero (coding_system_table, sizeof coding_system_table);
7700
7701 bzero (ascii_skip_code, sizeof ascii_skip_code);
7702 for (i = 0; i < 128; i++)
7703 ascii_skip_code[i] = 1;
7704
7705 #if defined (MSDOS) || defined (WINDOWSNT)
7706 system_eol_type = CODING_EOL_CRLF;
7707 #else
7708 system_eol_type = CODING_EOL_LF;
7709 #endif
7710
7711 inhibit_pre_post_conversion = 0;
7712 }
7713
7714 #ifdef emacs
7715
7716 void
7717 syms_of_coding ()
7718 {
7719 staticpro (&Vcode_conversion_workbuf_name);
7720 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7721
7722 Qtarget_idx = intern ("target-idx");
7723 staticpro (&Qtarget_idx);
7724
7725 Qcoding_system_history = intern ("coding-system-history");
7726 staticpro (&Qcoding_system_history);
7727 Fset (Qcoding_system_history, Qnil);
7728
7729 /* Target FILENAME is the first argument. */
7730 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7731 /* Target FILENAME is the third argument. */
7732 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7733
7734 Qcall_process = intern ("call-process");
7735 staticpro (&Qcall_process);
7736 /* Target PROGRAM is the first argument. */
7737 Fput (Qcall_process, Qtarget_idx, make_number (0));
7738
7739 Qcall_process_region = intern ("call-process-region");
7740 staticpro (&Qcall_process_region);
7741 /* Target PROGRAM is the third argument. */
7742 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7743
7744 Qstart_process = intern ("start-process");
7745 staticpro (&Qstart_process);
7746 /* Target PROGRAM is the third argument. */
7747 Fput (Qstart_process, Qtarget_idx, make_number (2));
7748
7749 Qopen_network_stream = intern ("open-network-stream");
7750 staticpro (&Qopen_network_stream);
7751 /* Target SERVICE is the fourth argument. */
7752 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7753
7754 Qcoding_system = intern ("coding-system");
7755 staticpro (&Qcoding_system);
7756
7757 Qeol_type = intern ("eol-type");
7758 staticpro (&Qeol_type);
7759
7760 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7761 staticpro (&Qbuffer_file_coding_system);
7762
7763 Qpost_read_conversion = intern ("post-read-conversion");
7764 staticpro (&Qpost_read_conversion);
7765
7766 Qpre_write_conversion = intern ("pre-write-conversion");
7767 staticpro (&Qpre_write_conversion);
7768
7769 Qno_conversion = intern ("no-conversion");
7770 staticpro (&Qno_conversion);
7771
7772 Qundecided = intern ("undecided");
7773 staticpro (&Qundecided);
7774
7775 Qcoding_system_p = intern ("coding-system-p");
7776 staticpro (&Qcoding_system_p);
7777
7778 Qcoding_system_error = intern ("coding-system-error");
7779 staticpro (&Qcoding_system_error);
7780
7781 Fput (Qcoding_system_error, Qerror_conditions,
7782 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7783 Fput (Qcoding_system_error, Qerror_message,
7784 build_string ("Invalid coding system"));
7785
7786 Qcoding_category = intern ("coding-category");
7787 staticpro (&Qcoding_category);
7788 Qcoding_category_index = intern ("coding-category-index");
7789 staticpro (&Qcoding_category_index);
7790
7791 Vcoding_category_table
7792 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7793 staticpro (&Vcoding_category_table);
7794 {
7795 int i;
7796 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7797 {
7798 XVECTOR (Vcoding_category_table)->contents[i]
7799 = intern (coding_category_name[i]);
7800 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7801 Qcoding_category_index, make_number (i));
7802 }
7803 }
7804
7805 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7806 staticpro (&Vcoding_system_safe_chars);
7807
7808 Qtranslation_table = intern ("translation-table");
7809 staticpro (&Qtranslation_table);
7810 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7811
7812 Qtranslation_table_id = intern ("translation-table-id");
7813 staticpro (&Qtranslation_table_id);
7814
7815 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7816 staticpro (&Qtranslation_table_for_decode);
7817
7818 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7819 staticpro (&Qtranslation_table_for_encode);
7820
7821 Qsafe_chars = intern ("safe-chars");
7822 staticpro (&Qsafe_chars);
7823
7824 Qchar_coding_system = intern ("char-coding-system");
7825 staticpro (&Qchar_coding_system);
7826
7827 /* Intern this now in case it isn't already done.
7828 Setting this variable twice is harmless.
7829 But don't staticpro it here--that is done in alloc.c. */
7830 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7831 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7832 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7833
7834 Qvalid_codes = intern ("valid-codes");
7835 staticpro (&Qvalid_codes);
7836
7837 Qascii_incompatible = intern ("ascii-incompatible");
7838 staticpro (&Qascii_incompatible);
7839
7840 Qemacs_mule = intern ("emacs-mule");
7841 staticpro (&Qemacs_mule);
7842
7843 Qraw_text = intern ("raw-text");
7844 staticpro (&Qraw_text);
7845
7846 Qutf_8 = intern ("utf-8");
7847 staticpro (&Qutf_8);
7848
7849 Qcoding_system_define_form = intern ("coding-system-define-form");
7850 staticpro (&Qcoding_system_define_form);
7851
7852 defsubr (&Scoding_system_p);
7853 defsubr (&Sread_coding_system);
7854 defsubr (&Sread_non_nil_coding_system);
7855 defsubr (&Scheck_coding_system);
7856 defsubr (&Sdetect_coding_region);
7857 defsubr (&Sdetect_coding_string);
7858 defsubr (&Sfind_coding_systems_region_internal);
7859 defsubr (&Sunencodable_char_position);
7860 defsubr (&Sdecode_coding_region);
7861 defsubr (&Sencode_coding_region);
7862 defsubr (&Sdecode_coding_string);
7863 defsubr (&Sencode_coding_string);
7864 defsubr (&Sdecode_sjis_char);
7865 defsubr (&Sencode_sjis_char);
7866 defsubr (&Sdecode_big5_char);
7867 defsubr (&Sencode_big5_char);
7868 defsubr (&Sset_terminal_coding_system_internal);
7869 defsubr (&Sset_safe_terminal_coding_system_internal);
7870 defsubr (&Sterminal_coding_system);
7871 defsubr (&Sset_keyboard_coding_system_internal);
7872 defsubr (&Skeyboard_coding_system);
7873 defsubr (&Sfind_operation_coding_system);
7874 defsubr (&Supdate_coding_systems_internal);
7875 defsubr (&Sset_coding_priority_internal);
7876 defsubr (&Sdefine_coding_system_internal);
7877
7878 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7879 doc: /* List of coding systems.
7880
7881 Do not alter the value of this variable manually. This variable should be
7882 updated by the functions `make-coding-system' and
7883 `define-coding-system-alias'. */);
7884 Vcoding_system_list = Qnil;
7885
7886 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7887 doc: /* Alist of coding system names.
7888 Each element is one element list of coding system name.
7889 This variable is given to `completing-read' as TABLE argument.
7890
7891 Do not alter the value of this variable manually. This variable should be
7892 updated by the functions `make-coding-system' and
7893 `define-coding-system-alias'. */);
7894 Vcoding_system_alist = Qnil;
7895
7896 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7897 doc: /* List of coding-categories (symbols) ordered by priority.
7898
7899 On detecting a coding system, Emacs tries code detection algorithms
7900 associated with each coding-category one by one in this order. When
7901 one algorithm agrees with a byte sequence of source text, the coding
7902 system bound to the corresponding coding-category is selected.
7903
7904 Don't modify this variable directly, but use `set-coding-priority'. */);
7905 {
7906 int i;
7907
7908 Vcoding_category_list = Qnil;
7909 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7910 Vcoding_category_list
7911 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7912 Vcoding_category_list);
7913 }
7914
7915 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7916 doc: /* Specify the coding system for read operations.
7917 It is useful to bind this variable with `let', but do not set it globally.
7918 If the value is a coding system, it is used for decoding on read operation.
7919 If not, an appropriate element is used from one of the coding system alists:
7920 There are three such tables, `file-coding-system-alist',
7921 `process-coding-system-alist', and `network-coding-system-alist'. */);
7922 Vcoding_system_for_read = Qnil;
7923
7924 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7925 doc: /* Specify the coding system for write operations.
7926 Programs bind this variable with `let', but you should not set it globally.
7927 If the value is a coding system, it is used for encoding of output,
7928 when writing it to a file and when sending it to a file or subprocess.
7929
7930 If this does not specify a coding system, an appropriate element
7931 is used from one of the coding system alists:
7932 There are three such tables, `file-coding-system-alist',
7933 `process-coding-system-alist', and `network-coding-system-alist'.
7934 For output to files, if the above procedure does not specify a coding system,
7935 the value of `buffer-file-coding-system' is used. */);
7936 Vcoding_system_for_write = Qnil;
7937
7938 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7939 doc: /* Coding system used in the latest file or process I/O.
7940 Also set by `encode-coding-region', `decode-coding-region',
7941 `encode-coding-string' and `decode-coding-string'. */);
7942 Vlast_coding_system_used = Qnil;
7943
7944 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7945 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7946 See info node `Coding Systems' and info node `Text and Binary' concerning
7947 such conversion. */);
7948 inhibit_eol_conversion = 0;
7949
7950 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7951 doc: /* Non-nil means process buffer inherits coding system of process output.
7952 Bind it to t if the process output is to be treated as if it were a file
7953 read from some filesystem. */);
7954 inherit_process_coding_system = 0;
7955
7956 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7957 doc: /* Alist to decide a coding system to use for a file I/O operation.
7958 The format is ((PATTERN . VAL) ...),
7959 where PATTERN is a regular expression matching a file name,
7960 VAL is a coding system, a cons of coding systems, or a function symbol.
7961 If VAL is a coding system, it is used for both decoding and encoding
7962 the file contents.
7963 If VAL is a cons of coding systems, the car part is used for decoding,
7964 and the cdr part is used for encoding.
7965 If VAL is a function symbol, the function must return a coding system
7966 or a cons of coding systems which are used as above. The function gets
7967 the arguments with which `find-operation-coding-system' was called.
7968
7969 See also the function `find-operation-coding-system'
7970 and the variable `auto-coding-alist'. */);
7971 Vfile_coding_system_alist = Qnil;
7972
7973 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7974 doc: /* Alist to decide a coding system to use for a process I/O operation.
7975 The format is ((PATTERN . VAL) ...),
7976 where PATTERN is a regular expression matching a program name,
7977 VAL is a coding system, a cons of coding systems, or a function symbol.
7978 If VAL is a coding system, it is used for both decoding what received
7979 from the program and encoding what sent to the program.
7980 If VAL is a cons of coding systems, the car part is used for decoding,
7981 and the cdr part is used for encoding.
7982 If VAL is a function symbol, the function must return a coding system
7983 or a cons of coding systems which are used as above.
7984
7985 See also the function `find-operation-coding-system'. */);
7986 Vprocess_coding_system_alist = Qnil;
7987
7988 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7989 doc: /* Alist to decide a coding system to use for a network I/O operation.
7990 The format is ((PATTERN . VAL) ...),
7991 where PATTERN is a regular expression matching a network service name
7992 or is a port number to connect to,
7993 VAL is a coding system, a cons of coding systems, or a function symbol.
7994 If VAL is a coding system, it is used for both decoding what received
7995 from the network stream and encoding what sent to the network stream.
7996 If VAL is a cons of coding systems, the car part is used for decoding,
7997 and the cdr part is used for encoding.
7998 If VAL is a function symbol, the function must return a coding system
7999 or a cons of coding systems which are used as above.
8000
8001 See also the function `find-operation-coding-system'. */);
8002 Vnetwork_coding_system_alist = Qnil;
8003
8004 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8005 doc: /* Coding system to use with system messages.
8006 Also used for decoding keyboard input on X Window system. */);
8007 Vlocale_coding_system = Qnil;
8008
8009 /* The eol mnemonics are reset in startup.el system-dependently. */
8010 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8011 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8012 eol_mnemonic_unix = build_string (":");
8013
8014 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8015 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8016 eol_mnemonic_dos = build_string ("\\");
8017
8018 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8019 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8020 eol_mnemonic_mac = build_string ("/");
8021
8022 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8023 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
8024 eol_mnemonic_undecided = build_string (":");
8025
8026 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8027 doc: /* *Non-nil enables character translation while encoding and decoding. */);
8028 Venable_character_translation = Qt;
8029
8030 DEFVAR_LISP ("standard-translation-table-for-decode",
8031 &Vstandard_translation_table_for_decode,
8032 doc: /* Table for translating characters while decoding. */);
8033 Vstandard_translation_table_for_decode = Qnil;
8034
8035 DEFVAR_LISP ("standard-translation-table-for-encode",
8036 &Vstandard_translation_table_for_encode,
8037 doc: /* Table for translating characters while encoding. */);
8038 Vstandard_translation_table_for_encode = Qnil;
8039
8040 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8041 doc: /* Alist of charsets vs revision numbers.
8042 While encoding, if a charset (car part of an element) is found,
8043 designate it with the escape sequence identifying revision (cdr part of the element). */);
8044 Vcharset_revision_alist = Qnil;
8045
8046 DEFVAR_LISP ("default-process-coding-system",
8047 &Vdefault_process_coding_system,
8048 doc: /* Cons of coding systems used for process I/O by default.
8049 The car part is used for decoding a process output,
8050 the cdr part is used for encoding a text to be sent to a process. */);
8051 Vdefault_process_coding_system = Qnil;
8052
8053 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8054 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8055 This is a vector of length 256.
8056 If Nth element is non-nil, the existence of code N in a file
8057 \(or output of subprocess) doesn't prevent it to be detected as
8058 a coding system of ISO 2022 variant which has a flag
8059 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8060 or reading output of a subprocess.
8061 Only 128th through 159th elements has a meaning. */);
8062 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8063
8064 DEFVAR_LISP ("select-safe-coding-system-function",
8065 &Vselect_safe_coding_system_function,
8066 doc: /* Function to call to select safe coding system for encoding a text.
8067
8068 If set, this function is called to force a user to select a proper
8069 coding system which can encode the text in the case that a default
8070 coding system used in each operation can't encode the text.
8071
8072 The default value is `select-safe-coding-system' (which see). */);
8073 Vselect_safe_coding_system_function = Qnil;
8074
8075 DEFVAR_BOOL ("coding-system-require-warning",
8076 &coding_system_require_warning,
8077 doc: /* Internal use only.
8078 If non-nil, on writing a file, `select-safe-coding-system-function' is
8079 called even if `coding-system-for-write' is non-nil. The command
8080 `universal-coding-system-argument' binds this variable to t temporarily. */);
8081 coding_system_require_warning = 0;
8082
8083
8084 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8085 &inhibit_iso_escape_detection,
8086 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8087
8088 By default, on reading a file, Emacs tries to detect how the text is
8089 encoded. This code detection is sensitive to escape sequences. If
8090 the sequence is valid as ISO2022, the code is determined as one of
8091 the ISO2022 encodings, and the file is decoded by the corresponding
8092 coding system (e.g. `iso-2022-7bit').
8093
8094 However, there may be a case that you want to read escape sequences in
8095 a file as is. In such a case, you can set this variable to non-nil.
8096 Then, as the code detection ignores any escape sequences, no file is
8097 detected as encoded in some ISO2022 encoding. The result is that all
8098 escape sequences become visible in a buffer.
8099
8100 The default value is nil, and it is strongly recommended not to change
8101 it. That is because many Emacs Lisp source files that contain
8102 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8103 in Emacs's distribution, and they won't be decoded correctly on
8104 reading if you suppress escape sequence detection.
8105
8106 The other way to read escape sequences in a file without decoding is
8107 to explicitly specify some coding system that doesn't use ISO2022's
8108 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8109 inhibit_iso_escape_detection = 0;
8110
8111 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8112 doc: /* Char table for translating self-inserting characters.
8113 This is applied to the result of input methods, not their input. See also
8114 `keyboard-translate-table'. */);
8115 Vtranslation_table_for_input = Qnil;
8116 }
8117
8118 char *
8119 emacs_strerror (error_number)
8120 int error_number;
8121 {
8122 char *str;
8123
8124 synchronize_system_messages_locale ();
8125 str = strerror (error_number);
8126
8127 if (! NILP (Vlocale_coding_system))
8128 {
8129 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8130 Vlocale_coding_system,
8131 0);
8132 str = (char *) SDATA (dec);
8133 }
8134
8135 return str;
8136 }
8137
8138 #endif /* emacs */
8139
8140 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8141 (do not change this comment) */