]> code.delx.au - gnu-emacs/blob - src/coding.c
Merged from emacs@sv.gnu.org
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7
8 This file is part of GNU Emacs.
9
10 GNU Emacs is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 GNU Emacs is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with GNU Emacs; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 Boston, MA 02110-1301, USA. */
24
25 /*** TABLE OF CONTENTS ***
26
27 0. General comments
28 1. Preamble
29 2. Emacs' internal format (emacs-mule) handlers
30 3. ISO2022 handlers
31 4. Shift-JIS and BIG5 handlers
32 5. CCL handlers
33 6. End-of-line handlers
34 7. C library functions
35 8. Emacs Lisp library functions
36 9. Post-amble
37
38 */
39
40 /*** 0. General comments ***/
41
42
43 /*** GENERAL NOTE on CODING SYSTEMS ***
44
45 A coding system is an encoding mechanism for one or more character
46 sets. Here's a list of coding systems which Emacs can handle. When
47 we say "decode", it means converting some other coding system to
48 Emacs' internal format (emacs-mule), and when we say "encode",
49 it means converting the coding system emacs-mule to some other
50 coding system.
51
52 0. Emacs' internal format (emacs-mule)
53
54 Emacs itself holds a multi-lingual character in buffers and strings
55 in a special format. Details are described in section 2.
56
57 1. ISO2022
58
59 The most famous coding system for multiple character sets. X's
60 Compound Text, various EUCs (Extended Unix Code), and coding
61 systems used in Internet communication such as ISO-2022-JP are
62 all variants of ISO2022. Details are described in section 3.
63
64 2. SJIS (or Shift-JIS or MS-Kanji-Code)
65
66 A coding system to encode character sets: ASCII, JISX0201, and
67 JISX0208. Widely used for PC's in Japan. Details are described in
68 section 4.
69
70 3. BIG5
71
72 A coding system to encode the character sets ASCII and Big5. Widely
73 used for Chinese (mainly in Taiwan and Hong Kong). Details are
74 described in section 4. In this file, when we write "BIG5"
75 (all uppercase), we mean the coding system, and when we write
76 "Big5" (capitalized), we mean the character set.
77
78 4. Raw text
79
80 A coding system for text containing random 8-bit code. Emacs does
81 no code conversion on such text except for end-of-line format.
82
83 5. Other
84
85 If a user wants to read/write text encoded in a coding system not
86 listed above, he can supply a decoder and an encoder for it as CCL
87 (Code Conversion Language) programs. Emacs executes the CCL program
88 while reading/writing.
89
90 Emacs represents a coding system by a Lisp symbol that has a property
91 `coding-system'. But, before actually using the coding system, the
92 information about it is set in a structure of type `struct
93 coding_system' for rapid processing. See section 6 for more details.
94
95 */
96
97 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
98
99 How end-of-line of text is encoded depends on the operating system.
100 For instance, Unix's format is just one byte of `line-feed' code,
101 whereas DOS's format is two-byte sequence of `carriage-return' and
102 `line-feed' codes. MacOS's format is usually one byte of
103 `carriage-return'.
104
105 Since text character encoding and end-of-line encoding are
106 independent, any coding system described above can have any
107 end-of-line format. So Emacs has information about end-of-line
108 format in each coding-system. See section 6 for more details.
109
110 */
111
112 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
113
114 These functions check if a text between SRC and SRC_END is encoded
115 in the coding system category XXX. Each returns an integer value in
116 which appropriate flag bits for the category XXX are set. The flag
117 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
118 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
119 of the range 0x80..0x9F are in multibyte form. */
120 #if 0
121 int
122 detect_coding_emacs_mule (src, src_end, multibytep)
123 unsigned char *src, *src_end;
124 int multibytep;
125 {
126 ...
127 }
128 #endif
129
130 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
131
132 These functions decode SRC_BYTES length of unibyte text at SOURCE
133 encoded in CODING to Emacs' internal format. The resulting
134 multibyte text goes to a place pointed to by DESTINATION, the length
135 of which should not exceed DST_BYTES.
136
137 These functions set the information about original and decoded texts
138 in the members `produced', `produced_char', `consumed', and
139 `consumed_char' of the structure *CODING. They also set the member
140 `result' to one of CODING_FINISH_XXX indicating how the decoding
141 finished.
142
143 DST_BYTES zero means that the source area and destination area are
144 overlapped, which means that we can produce a decoded text until it
145 reaches the head of the not-yet-decoded source text.
146
147 Below is a template for these functions. */
148 #if 0
149 static void
150 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
151 struct coding_system *coding;
152 const unsigned char *source;
153 unsigned char *destination;
154 int src_bytes, dst_bytes;
155 {
156 ...
157 }
158 #endif
159
160 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
161
162 These functions encode SRC_BYTES length text at SOURCE from Emacs'
163 internal multibyte format to CODING. The resulting unibyte text
164 goes to a place pointed to by DESTINATION, the length of which
165 should not exceed DST_BYTES.
166
167 These functions set the information about original and encoded texts
168 in the members `produced', `produced_char', `consumed', and
169 `consumed_char' of the structure *CODING. They also set the member
170 `result' to one of CODING_FINISH_XXX indicating how the encoding
171 finished.
172
173 DST_BYTES zero means that the source area and destination area are
174 overlapped, which means that we can produce encoded text until it
175 reaches the head of the not-yet-encoded source text.
176
177 Below is a template for these functions. */
178 #if 0
179 static void
180 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
181 struct coding_system *coding;
182 unsigned char *source, *destination;
183 int src_bytes, dst_bytes;
184 {
185 ...
186 }
187 #endif
188
189 /*** COMMONLY USED MACROS ***/
190
191 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
192 get one and two bytes from the source text respectively. If there
193 are not enough bytes in the source, they set coding->result to
194 CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
195 without advancing `src'. The caller should set variables `coding',
196 `src' and `src_end' to appropriate pointers in advance. These macros
197 are called from decoding routines `decode_coding_XXX', thus it is assumed that the source text is unibyte. */
198
199 #define ONE_MORE_BYTE(c1) \
200 do { \
201 if (src >= src_end) \
202 { \
203 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
204 goto label_end_of_loop; \
205 } \
206 c1 = *src++; \
207 } while (0)
208
209 #define TWO_MORE_BYTES(c1, c2) \
210 do { \
211 if (src + 1 >= src_end) \
212 { \
213 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
214 goto label_end_of_loop; \
215 } \
216 c1 = *src++; \
217 c2 = *src++; \
218 } while (0)
219
220
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero: a byte equal to
   LEADING_CODE_8_BIT_CONTROL is then combined with the byte that
   follows it, and C1 is set to that following byte minus 0x20.

   In addition, if SRC is not less than SRC_END, set coding->result
   to CODING_FINISH_INSUFFICIENT_SRC and return with RET instead of
   jumping to a label.  The same happens when the source ends right
   after LEADING_CODE_8_BIT_CONTROL, so that the trailing byte is
   never fetched from beyond SRC_END.

   The caller should set variables `coding', `src' and `src_end' in
   advance.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        return ret;                                             \
      }                                                         \
    (c1) = *src++;                                              \
    if ((multibytep) && (c1) == LEADING_CODE_8_BIT_CONTROL)     \
      {                                                         \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            return ret;                                         \
          }                                                     \
        (c1) = *src++ - 0x20;                                   \
      }                                                         \
  } while (0)
236
237 /* Set C to the next character at the source text pointed to by `src'
238 and advance `src' past it. If there are not enough characters in
239 the source, jump to `label_end_of_loop'. The caller should set
240 variables `coding', `src', `src_end', and `translation_table' in
241 advance. This macro is used in encoding routines
242 `encode_coding_XXX', thus it assumes that the source text is in
243 multibyte form except for 8-bit characters. 8-bit characters are
244 in multibyte form if coding->src_multibyte is nonzero, else they
245 are represented by a single byte. */
246
247 #define ONE_MORE_CHAR(c) \
248 do { \
249 int len = src_end - src; \
250 int bytes; \
251 if (len <= 0) \
252 { \
253 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
254 goto label_end_of_loop; \
255 } \
256 if (coding->src_multibyte \
257 || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
258 c = STRING_CHAR_AND_LENGTH (src, len, bytes); \
259 else \
260 c = *src, bytes = 1; \
261 if (!NILP (translation_table)) \
262 c = translate_char (translation_table, c, -1, 0, 0); \
263 src += bytes; \
264 } while (0)
265
266
267 /* Produce a multibyte form of character C to `dst'. Jump to
268 `label_end_of_loop' if there's not enough space at `dst'.
269
270 If we are now in the middle of a composition sequence, the decoded
271 character may be ALTCHAR (for the current composition). In that
272 case, the character goes to coding->cmp_data->data instead of
273 `dst'. For COMPOSITION_WITH_RULE the character is both written to `dst' and recorded in coding->cmp_data->data. Whenever a component is recorded, coding->composition_rule_follows is set to nonzero unless the method is COMPOSITION_WITH_ALTCHARS.
274
275 This macro is used in decoding routines. */
276
277 #define EMIT_CHAR(c) \
278 do { \
279 if (! COMPOSING_P (coding) \
280 || coding->composing == COMPOSITION_RELATIVE \
281 || coding->composing == COMPOSITION_WITH_RULE) \
282 { \
283 int bytes = CHAR_BYTES (c); \
284 if ((dst + bytes) > (dst_bytes ? dst_end : src)) \
285 { \
286 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
287 goto label_end_of_loop; \
288 } \
289 dst += CHAR_STRING (c, dst); \
290 coding->produced_char++; \
291 } \
292 \
293 if (COMPOSING_P (coding) \
294 && coding->composing != COMPOSITION_RELATIVE) \
295 { \
296 CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
297 coding->composition_rule_follows \
298 = coding->composing != COMPOSITION_WITH_ALTCHARS; \
299 } \
300 } while (0)
301
302
/* Store the single byte C at `dst' and advance `dst'.  If there is no
   room left (the limit is `dst_end', or `src' when `dst_bytes' is
   zero), set coding->result to CODING_FINISH_INSUFFICIENT_DST and
   jump to `label_end_of_loop' without storing anything.  The caller
   should set variables `coding', `src', `dst', `dst_end' and
   `dst_bytes' in advance.  */

#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst >= (dst_bytes ? dst_end : src))                     \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = (c);                                               \
  } while (0)
312
/* Store the two bytes C1 and C2 at `dst' and advance `dst' by two.
   If fewer than two bytes of room are left (the limit is `dst_end',
   or `src' when `dst_bytes' is zero), set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'
   without storing anything.  */

#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 > (dst_bytes ? dst_end : src))                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = (c1), *dst++ = (c2);                               \
  } while (0)
322
/* Copy the bytes in the range FROM (inclusive) to TO (exclusive) to
   `dst' and advance `dst'.  FROM must be a modifiable pointer
   variable; it is advanced up to TO as a side effect.  If the whole
   range does not fit (the limit is `dst_end', or `src' when
   `dst_bytes' is zero), set coding->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'
   without copying anything.  */

#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + ((to) - (from)) > (dst_bytes ? dst_end : src))    \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    while ((from) < (to))                                       \
      *dst++ = *(from)++;                                       \
  } while (0)
333
334 \f
335 /*** 1. Preamble ***/
336
337 #ifdef emacs
338 #include <config.h>
339 #endif
340
341 #include <stdio.h>
342
343 #ifdef emacs
344
345 #include "lisp.h"
346 #include "buffer.h"
347 #include "charset.h"
348 #include "composite.h"
349 #include "ccl.h"
350 #include "coding.h"
351 #include "window.h"
352 #include "intervals.h"
353 #include "frame.h"
354 #include "termhooks.h"
355
356 #else /* not emacs */
357
358 #include "mulelib.h"
359
360 #endif /* not emacs */
361
362 Lisp_Object Qcoding_system, Qeol_type;
363 Lisp_Object Qbuffer_file_coding_system;
364 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
365 Lisp_Object Qno_conversion, Qundecided;
366 Lisp_Object Qcoding_system_history;
367 Lisp_Object Qsafe_chars;
368 Lisp_Object Qvalid_codes;
369 Lisp_Object Qascii_incompatible;
370
371 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
372 Lisp_Object Qcall_process, Qcall_process_region;
373 Lisp_Object Qstart_process, Qopen_network_stream;
374 Lisp_Object Qtarget_idx;
375
376 /* If a symbol has this property, evaluate the value to define the
377 symbol as a coding system. */
378 Lisp_Object Qcoding_system_define_form;
379
380 Lisp_Object Vselect_safe_coding_system_function;
381
382 int coding_system_require_warning;
383
384 /* Mnemonic string for each format of end-of-line. */
385 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
386 /* Mnemonic string to indicate format of end-of-line is not yet
387 decided. */
388 Lisp_Object eol_mnemonic_undecided;
389
390 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
391 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
392 This has an effect only for external encoding (i.e. for output to
393 file and process), not for in-buffer or Lisp string encoding. */
394 int system_eol_type;
395
396 #ifdef emacs
397
398 /* Information about which coding system is safe for which chars.
399 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
400
401 GENERIC-LIST is a list of generic coding systems which can encode
402 any characters.
403
404 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
405 corresponding char table that contains safe chars. */
406 Lisp_Object Vcoding_system_safe_chars;
407
408 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
409
410 Lisp_Object Qcoding_system_p, Qcoding_system_error;
411
412 /* Coding system emacs-mule and raw-text are for converting only
413 end-of-line format. */
414 Lisp_Object Qemacs_mule, Qraw_text;
415
416 Lisp_Object Qutf_8;
417
418 /* Coding-systems are handed between Emacs Lisp programs and C internal
419 routines by the following three variables. */
420 /* Coding-system for reading files and receiving data from process. */
421 Lisp_Object Vcoding_system_for_read;
422 /* Coding-system for writing files and sending data to process. */
423 Lisp_Object Vcoding_system_for_write;
424 /* Coding-system actually used in the latest I/O. */
425 Lisp_Object Vlast_coding_system_used;
426
427 /* A vector of length 256 which contains information about special
428 Latin codes (especially for dealing with Microsoft codes). */
429 Lisp_Object Vlatin_extra_code_table;
430
431 /* Flag to inhibit code conversion of end-of-line format. */
432 int inhibit_eol_conversion;
433
434 /* Flag to inhibit ISO2022 escape sequence detection. */
435 int inhibit_iso_escape_detection;
436
437 /* Flag to make buffer-file-coding-system inherit from process-coding. */
438 int inherit_process_coding_system;
439
440 /* Coding system to be used to encode text for terminal display when
441 terminal coding system is nil. */
442 struct coding_system safe_terminal_coding;
443
444 /* Default coding system to be used to write a file. */
445 struct coding_system default_buffer_file_coding;
446
447 Lisp_Object Vfile_coding_system_alist;
448 Lisp_Object Vprocess_coding_system_alist;
449 Lisp_Object Vnetwork_coding_system_alist;
450
451 Lisp_Object Vlocale_coding_system;
452
453 #endif /* emacs */
454
455 Lisp_Object Qcoding_category, Qcoding_category_index;
456
457 /* List of symbols `coding-category-xxx' ordered by priority. */
458 Lisp_Object Vcoding_category_list;
459
460 /* Table of coding categories (Lisp symbols). */
461 Lisp_Object Vcoding_category_table;
462
463 /* Table of symbol names, one for each coding-category. */
464 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
465 "coding-category-emacs-mule",
466 "coding-category-sjis",
467 "coding-category-iso-7",
468 "coding-category-iso-7-tight",
469 "coding-category-iso-8-1",
470 "coding-category-iso-8-2",
471 "coding-category-iso-7-else",
472 "coding-category-iso-8-else",
473 "coding-category-ccl",
474 "coding-category-big5",
475 "coding-category-utf-8",
476 "coding-category-utf-16-be",
477 "coding-category-utf-16-le",
478 "coding-category-raw-text",
479 "coding-category-binary"
480 };
481
482 /* Table of pointers to coding systems corresponding to each coding
483 category. */
484 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
485
486 /* Table of coding category masks. Nth element is a mask for a coding
487 category of which priority is Nth. */
488 static
489 int coding_priorities[CODING_CATEGORY_IDX_MAX];
490
491 /* Flag to tell if we look up translation table on character code
492 conversion. */
493 Lisp_Object Venable_character_translation;
494 /* Standard translation table to look up on decoding (reading). */
495 Lisp_Object Vstandard_translation_table_for_decode;
496 /* Standard translation table to look up on encoding (writing). */
497 Lisp_Object Vstandard_translation_table_for_encode;
498
499 Lisp_Object Qtranslation_table;
500 Lisp_Object Qtranslation_table_id;
501 Lisp_Object Qtranslation_table_for_decode;
502 Lisp_Object Qtranslation_table_for_encode;
503
504 /* Alist of charsets vs revision number. */
505 Lisp_Object Vcharset_revision_alist;
506
507 /* Default coding systems used for process I/O. */
508 Lisp_Object Vdefault_process_coding_system;
509
510 /* Char table for translating Quail and self-inserting input. */
511 Lisp_Object Vtranslation_table_for_input;
512
513 /* Global flag to tell that we can't call post-read-conversion and
514 pre-write-conversion functions. Usually the value is zero, but it
515 is set to 1 temporarily while such functions are running. This is
516 to avoid infinite recursive call. */
517 static int inhibit_pre_post_conversion;
518
519 Lisp_Object Qchar_coding_system;
520
521 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
522 its validity. */
523
524 Lisp_Object
525 coding_safe_chars (coding_system)
526 Lisp_Object coding_system;
527 {
528 Lisp_Object coding_spec, plist, safe_chars;
529
530 coding_spec = Fget (coding_system, Qcoding_system);
531 plist = XVECTOR (coding_spec)->contents[3];
532 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
533 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
534 }
535
/* Nonzero if character C is safe according to SAFE_CHARS: either
SAFE_CHARS is Qt, or it is a char table whose entry for C is
non-nil. */
536 #define CODING_SAFE_CHAR_P(safe_chars, c) \
537 (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
538
539 \f
540 /*** 2. Emacs internal format (emacs-mule) handlers ***/
541
542 /* Emacs' internal format for representation of multiple character
543 sets is a kind of multi-byte encoding, i.e. characters are
544 represented by variable-length sequences of one-byte codes.
545
546 ASCII characters and control characters (e.g. `tab', `newline') are
547 represented by one-byte sequences which are their ASCII codes, in
548 the range 0x00 through 0x7F.
549
550 8-bit characters of the range 0x80..0x9F are represented by
551 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
552 code + 0x20).
553
554 8-bit characters of the range 0xA0..0xFF are represented by
555 one-byte sequences which are their 8-bit code.
556
557 The other characters are represented by a sequence of `base
558 leading-code', optional `extended leading-code', and one or two
559 `position-code's. The length of the sequence is determined by the
560 base leading-code. Leading-code takes the range 0x81 through 0x9D,
561 whereas extended leading-code and position-code take the range 0xA0
562 through 0xFF. See `charset.h' for more details about leading-code
563 and position-code.
564
565 --- CODE RANGE of Emacs' internal format ---
566 character set range
567 ------------- -----
568 ascii 0x00..0x7F
569 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
570 eight-bit-graphic 0xA0..0xFF
571 ELSE 0x81..0x9D + [0xA0..0xFF]+
572 ---------------------------------------------
573
574 As this is the internal character representation, the format is
575 usually not used externally (i.e. in a file or in a data sent to a
576 process). But, it is possible to have a text externally in this
577 format (i.e. by encoding by the coding system `emacs-mule').
578
579 In that case, a sequence of one-byte codes has a slightly different
580 form.
581
582 Firstly, all characters in eight-bit-control are represented by
583 one-byte sequences which are their 8-bit code.
584
585 Next, character composition data are represented by the byte
586 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
587 where,
588 METHOD is 0xF0 plus one of composition method (enum
589 composition_method),
590
591 BYTES is 0xA0 plus the byte length of these composition data,
592
593 CHARS is 0xA0 plus the number of characters composed by these
594 data,
595
596 COMPONENTs are characters of multibyte form or composition
597 rules encoded as two bytes of ASCII codes.
598
599 In addition, for backward compatibility, the following formats are
600 also recognized as composition data on decoding.
601
602 0x80 MSEQ ...
603 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
604
605 Here,
606 MSEQ is a multibyte form but in this special format:
607 ASCII: 0xA0 ASCII_CODE+0x80,
608 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
609 RULE is a one byte code of the range 0xA0..0xF0 that
610 represents a composition rule.
611 */
612
613 enum emacs_code_class_type emacs_code_class[256];
614
615 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
616 Check if a text is encoded in Emacs' internal format. If it is,
617 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.
The text is rejected (0 is returned) as soon as an ESC, SI or SO
control code is seen, when a byte in the range 0x81..0x9F does not
start a sequence accepted by UNIBYTE_STR_AS_MULTIBYTE_P, or when the
source ends right after the 0xA0 prefix inside an old-style
composition. Reaching the end of the source otherwise means the
text is accepted. */
618
619 static int
620 detect_coding_emacs_mule (src, src_end, multibytep)
621 unsigned char *src, *src_end;
622 int multibytep;
623 {
624 unsigned char c;
625 int composing = 0;
626 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE, which stores into coding->result. */
627 struct coding_system dummy_coding;
628 struct coding_system *coding = &dummy_coding;
629
630 while (1)
631 {
/* Returns CODING_CATEGORY_MASK_EMACS_MULE when the source is
exhausted. */
632 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
633 CODING_CATEGORY_MASK_EMACS_MULE);
/* While inside an old-style composition, undo the MSEQ escaping of
each component byte: a byte below 0xA0 ends the composition, 0xA0
prefixes an ASCII code stored with the high bit set, and any other
byte is a leading code shifted up by 0x20. */
634 if (composing)
635 {
636 if (c < 0xA0)
637 composing = 0;
638 else if (c == 0xA0)
639 {
640 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
641 c &= 0x7F;
642 }
643 else
644 c -= 0x20;
645 }
646
647 if (c < 0x20)
648 {
649 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
650 return 0;
651 }
652 else if (c >= 0x80 && c < 0xA0)
653 {
654 if (c == 0x80)
655 /* Old leading code for a composite character. */
656 composing = 1;
657 else
658 {
/* SRC_BASE is the byte just fetched; the sequence starting there
must be accepted as a whole, after which it is skipped. */
659 unsigned char *src_base = src - 1;
660 int bytes;
661
662 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
663 bytes))
664 return 0;
665 src = src_base + bytes;
666 }
667 }
668 }
669 }
670
671
672 /* Record the starting position START and METHOD of one composition.
This appends a four-element header to CODING->cmp_data->data and
remembers its index in CODING->cmp_data_start. Element 0 is set to
-1 as a placeholder, element 1 to the char offset plus START, and
element 3 to METHOD; element 2 is not written here.
CODING_ADD_COMPOSITION_END later overwrites element 0 and sets
element 2. */
673
674 #define CODING_ADD_COMPOSITION_START(coding, start, method) \
675 do { \
676 struct composition_data *cmp_data = coding->cmp_data; \
677 int *data = cmp_data->data + cmp_data->used; \
678 coding->cmp_data_start = cmp_data->used; \
679 data[0] = -1; \
680 data[1] = cmp_data->char_offset + start; \
681 data[3] = (int) method; \
682 cmp_data->used += 4; \
683 } while (0)
684
685 /* Record the ending position END of the current composition.
In the header that starts at CODING->cmp_data_start, element 0 is
set to the number of data elements used since that header began
(header included), and element 2 is set to the char offset plus
END. */
686
687 #define CODING_ADD_COMPOSITION_END(coding, end) \
688 do { \
689 struct composition_data *cmp_data = coding->cmp_data; \
690 int *data = cmp_data->data + coding->cmp_data_start; \
691 data[0] = cmp_data->used - coding->cmp_data_start; \
692 data[2] = cmp_data->char_offset + end; \
693 } while (0)
694
695 /* Record one COMPONENT (alternate character or composition rule).
If that makes the data recorded for the current composition reach
COMPOSITION_DATA_MAX_BUNCH_LENGTH elements, the composition is
closed at CODING->produced_char and CODING->composing is reset to
COMPOSITION_NO. */
696
697 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \
698 do { \
699 coding->cmp_data->data[coding->cmp_data->used++] = component; \
700 if (coding->cmp_data->used - coding->cmp_data_start \
701 == COMPOSITION_DATA_MAX_BUNCH_LENGTH) \
702 { \
703 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
704 coding->composing = COMPOSITION_NO; \
705 } \
706 } while (0)
707
708
709 /* Get one byte from the data pointed to by SRC and increment SRC. If SRC
710 is not less than SRC_END, return -1 without incrementing SRC. */
711
712 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
713
714
715 /* Decode a character represented as a component of composition
716 sequence of Emacs 20 style at SRC. Set C to that character, store
717 its multibyte form sequence at P, and set P to the end of that
718 sequence. If no valid character is found, set C to -1.
Bytes are fetched with SAFE_ONE_MORE_BYTE, so the caller must have
`src', `src_end' and `coding' in scope. Two forms are accepted:
0xA0 followed by a byte not less than 0xA0, which yields that byte
minus 0x80; and a byte that is a base leading code plus 0x20,
followed by the remaining bytes of the character. A first byte for
which CHAR_HEAD_P holds, or any other byte, yields -1. Note that P
may already have been advanced when C ends up as -1. */
719
720 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p) \
721 do { \
722 int bytes; \
723 \
724 c = SAFE_ONE_MORE_BYTE (); \
725 if (c < 0) \
726 break; \
727 if (CHAR_HEAD_P (c)) \
728 c = -1; \
729 else if (c == 0xA0) \
730 { \
731 c = SAFE_ONE_MORE_BYTE (); \
732 if (c < 0xA0) \
733 c = -1; \
734 else \
735 { \
736 c -= 0x80; \
737 *p++ = c; \
738 } \
739 } \
740 else if (BASE_LEADING_CODE_P (c - 0x20)) \
741 { \
742 unsigned char *p0 = p; \
743 \
744 c -= 0x20; \
745 *p++ = c; \
746 bytes = BYTES_BY_CHAR_HEAD (c); \
747 while (--bytes) \
748 { \
749 c = SAFE_ONE_MORE_BYTE (); \
750 if (c < 0) \
751 break; \
752 *p++ = c; \
753 } \
754 if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes) \
755 || (coding->flags /* We are recovering a file. */ \
756 && p0[0] == LEADING_CODE_8_BIT_CONTROL \
757 && ! CHAR_HEAD_P (p0[1]))) \
758 c = STRING_CHAR (p0, bytes); \
759 else \
760 c = -1; \
761 } \
762 else \
763 c = -1; \
764 } while (0)
765
766
767 /* Decode a composition rule represented as a component of composition
768 sequence of Emacs 20 style at SRC. Set C to the rule. If no
769 valid rule is found, set C to -1.
The rule byte minus 0xA0 must be in the range 0..80; it is split
into GREF (quotient by 9) and NREF (remainder by 9). The caller
must have `src', `src_end', `gref' and `nref' in scope; `gref' and
`nref' are overwritten when the rule is valid. */
770
771 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c) \
772 do { \
773 c = SAFE_ONE_MORE_BYTE (); \
774 c -= 0xA0; \
775 if (c < 0 || c >= 81) \
776 c = -1; \
777 else \
778 { \
779 gref = c / 9, nref = c % 9; \
780 c = COMPOSITION_ENCODE_RULE (gref, nref); \
781 } \
782 } while (0)
783
784
785 /* Decode composition sequence encoded by `emacs-mule' at the source
786 pointed by SRC. SRC_END is the end of source. Store information
787 of the composition in CODING->cmp_data.
788
789 For backward compatibility, decode also a composition sequence of
790 Emacs 20 style. In that case, the composition sequence contains
791 characters that should be extracted into a buffer or string. Store
792 those characters at *DESTINATION in multibyte form.
793
794 If we encounter an invalid byte sequence, return 0.
795 If we encounter an insufficient source or destination, or
796 insufficient space in CODING->cmp_data, return 1.
797 Otherwise, return consumed bytes in the source.
798
799 */
800 static INLINE int
801 decode_composition_emacs_mule (coding, src, src_end,
802 destination, dst_end, dst_bytes)
803 struct coding_system *coding;
804 const unsigned char *src, *src_end;
805 unsigned char **destination, *dst_end;
806 int dst_bytes;
807 {
808 unsigned char *dst = *destination;
809 int method, data_len, nchars;
810 const unsigned char *src_base = src++;
811 /* Store components of composition. */
812 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
813 int ncomponent;
814 /* Store multibyte form of characters to be composed. This is for
815 Emacs 20 style composition sequence. */
816 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
817 unsigned char *bufp = buf;
818 int c, i, gref, nref;
819
820 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
821 >= COMPOSITION_DATA_SIZE)
822 {
823 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
824 return -1;
825 }
826
827 ONE_MORE_BYTE (c);
828 if (c - 0xF0 >= COMPOSITION_RELATIVE
829 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
830 {
831 int with_rule;
832
833 method = c - 0xF0;
834 with_rule = (method == COMPOSITION_WITH_RULE
835 || method == COMPOSITION_WITH_RULE_ALTCHARS);
836 ONE_MORE_BYTE (c);
837 data_len = c - 0xA0;
838 if (data_len < 4
839 || src_base + data_len > src_end)
840 return 0;
841 ONE_MORE_BYTE (c);
842 nchars = c - 0xA0;
843 if (c < 1)
844 return 0;
845 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
846 {
847 /* If it is longer than this, it can't be valid. */
848 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
849 return 0;
850
851 if (ncomponent % 2 && with_rule)
852 {
853 ONE_MORE_BYTE (gref);
854 gref -= 32;
855 ONE_MORE_BYTE (nref);
856 nref -= 32;
857 c = COMPOSITION_ENCODE_RULE (gref, nref);
858 }
859 else
860 {
861 int bytes;
862 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
863 || (coding->flags /* We are recovering a file. */
864 && src[0] == LEADING_CODE_8_BIT_CONTROL
865 && ! CHAR_HEAD_P (src[1])))
866 c = STRING_CHAR (src, bytes);
867 else
868 c = *src, bytes = 1;
869 src += bytes;
870 }
871 component[ncomponent] = c;
872 }
873 }
874 else if (c >= 0x80)
875 {
876 /* This may be an old Emacs 20 style format. See the comment at
877 the section 2 of this file. */
878 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
879 if (src == src_end
880 && !(coding->mode & CODING_MODE_LAST_BLOCK))
881 goto label_end_of_loop;
882
883 src_end = src;
884 src = src_base + 1;
885 if (c < 0xC0)
886 {
887 method = COMPOSITION_RELATIVE;
888 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
889 {
890 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
891 if (c < 0)
892 break;
893 component[ncomponent++] = c;
894 }
895 if (ncomponent < 2)
896 return 0;
897 nchars = ncomponent;
898 }
899 else if (c == 0xFF)
900 {
901 method = COMPOSITION_WITH_RULE;
902 src++;
903 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
904 if (c < 0)
905 return 0;
906 component[0] = c;
907 for (ncomponent = 1;
908 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
909 {
910 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
911 if (c < 0)
912 break;
913 component[ncomponent++] = c;
914 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
915 if (c < 0)
916 break;
917 component[ncomponent++] = c;
918 }
919 if (ncomponent < 3)
920 return 0;
921 nchars = (ncomponent + 1) / 2;
922 }
923 else
924 return 0;
925 }
926 else
927 return 0;
928
929 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
930 {
931 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
932 for (i = 0; i < ncomponent; i++)
933 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
934 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
935 if (buf < bufp)
936 {
937 unsigned char *p = buf;
938 EMIT_BYTES (p, bufp);
939 *destination += bufp - buf;
940 coding->produced_char += nchars;
941 }
942 return (src - src_base);
943 }
944 label_end_of_loop:
945 return -1;
946 }
947
948 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
Decode text encoded by `emacs-mule'. End-of-line bytes are converted
according to coding->eol_type, a sequence starting with 0x80 is
handed to decode_composition_emacs_mule when coding->cmp_data is
non-NULL, and other byte sequences are copied to the destination,
except that a leading byte whose sequence is cut short by another
character head is emitted by itself through CHAR_STRING.

On return coding->consumed and coding->consumed_char are both set to
the number of source bytes fully processed, and coding->produced to
the number of bytes stored.

NOTE(review): the `\r' and `\n' branches store to DST without
checking the destination limit, unlike the general path at the bottom
of the loop; presumably callers guarantee enough room -- confirm. */
949
950 static void
951 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
952 struct coding_system *coding;
953 const unsigned char *source;
954 unsigned char *destination;
955 int src_bytes, dst_bytes;
956 {
957 const unsigned char *src = source;
958 const unsigned char *src_end = source + src_bytes;
959 unsigned char *dst = destination;
960 unsigned char *dst_end = destination + dst_bytes;
961 /* SRC_BASE remembers the start position in source in each loop.
962 The loop will be exited when there's not enough source code, or
963 when there's not enough destination area to produce a
964 character. */
965 const unsigned char *src_base;
966
967 coding->produced_char = 0;
968 while ((src_base = src) < src_end)
969 {
970 unsigned char tmp[MAX_MULTIBYTE_LENGTH];
971 const unsigned char *p;
972 int bytes;
973
974 if (*src == '\r')
975 {
976 int c = *src++;
977
/* CR alone is a newline for CODING_EOL_CR; for CODING_EOL_CRLF the
pair CR LF becomes a newline and a CR not followed by LF is kept. */
978 if (coding->eol_type == CODING_EOL_CR)
979 c = '\n';
980 else if (coding->eol_type == CODING_EOL_CRLF)
981 {
982 ONE_MORE_BYTE (c);
983 if (c != '\n')
984 {
985 src--;
986 c = '\r';
987 }
988 }
989 *dst++ = c;
990 coding->produced_char++;
991 continue;
992 }
993 else if (*src == '\n')
994 {
995 if ((coding->eol_type == CODING_EOL_CR
996 || coding->eol_type == CODING_EOL_CRLF)
997 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
998 {
999 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1000 goto label_end_of_loop;
1001 }
1002 *dst++ = *src++;
1003 coding->produced_char++;
1004 continue;
1005 }
1006 else if (*src == 0x80 && coding->cmp_data)
1007 {
1008 /* Start of composition data. */
1009 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1010 &dst, dst_end,
1011 dst_bytes);
1012 if (consumed < 0)
1013 goto label_end_of_loop;
1014 else if (consumed > 0)
1015 {
1016 src += consumed;
1017 continue;
1018 }
/* CONSUMED is zero: not a valid composition sequence, so emit the
0x80 byte itself as one character. */
1019 bytes = CHAR_STRING (*src, tmp);
1020 p = tmp;
1021 src++;
1022 }
1023 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1024 || (coding->flags /* We are recovering a file. */
1025 && src[0] == LEADING_CODE_8_BIT_CONTROL
1026 && ! CHAR_HEAD_P (src[1])))
1027 {
1028 p = src;
1029 src += bytes;
1030 }
1031 else
1032 {
1033 int i, c;
1034
/* Scan the trailing bytes that the leading byte calls for, stopping
early if another character head shows up. */
1035 bytes = BYTES_BY_CHAR_HEAD (*src);
1036 src++;
1037 for (i = 1; i < bytes; i++)
1038 {
1039 ONE_MORE_BYTE (c);
1040 if (CHAR_HEAD_P (c))
1041 break;
1042 }
1043 if (i < bytes)
1044 {
1045 bytes = CHAR_STRING (*src_base, tmp);
1046 p = tmp;
1047 src = src_base + 1;
1048 }
1049 else
1050 {
1051 p = src_base;
1052 }
1053 }
/* With DST_BYTES zero the source and destination overlap, so the
limit is the not-yet-decoded source. */
1054 if (dst + bytes >= (dst_bytes ? dst_end : src))
1055 {
1056 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1057 break;
1058 }
1059 while (bytes--) *dst++ = *p++;
1060 coding->produced_char++;
1061 }
1062 label_end_of_loop:
1063 coding->consumed = coding->consumed_char = src_base - source;
1064 coding->produced = dst - destination;
1065 }
1066
1067
1068 /* Encode composition data stored at DATA into a special byte sequence
1069 starting by 0x80. Update CODING->cmp_data_start and maybe
1070 CODING->cmp_data for the next call.

   The sequence is assembled in a local buffer and consists of:
     byte 0: 0x80
     byte 1: 0xF0 + DATA[3] (the composition method)
     byte 2: 0xA0 + the length of the whole sequence, these four
	     header bytes included
     byte 3: 0xA0 + (DATA[2] - DATA[1]) (number of composed chars)
   followed by the components, each written with CHAR_STRING.  For
   COMPOSITION_WITH_RULE and COMPOSITION_WITH_RULE_ALTCHARS every
   rule is written as the two bytes 0x20 + GREF and 0x20 + NREF in
   front of the character that follows it.

   DATA[0] is used both as the end index of the entry and as the
   amount added to CODING->cmp_data_start.  When the start index
   reaches CODING->cmp_data->used and a next block exists,
   CODING->cmp_data moves on to that block.

   The macro refers to DST, DST_END, DST_BYTES and SRC of the
   function in which it is expanded.  If the sequence plus four more
   bytes does not fit below the write limit, it sets CODING->result
   to CODING_FINISH_INSUFFICIENT_DST and jumps to label_end_of_loop
   without storing anything.

   NOTE(review): the local buffer holds 1024 bytes and no length
   check is visible while it is filled -- confirm that the producer
   of DATA bounds DATA[0].  */
1071
1072 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
1073 do { \
1074 unsigned char buf[1024], *p0 = buf, *p; \
1075 int len = data[0]; \
1076 int i; \
1077 \
1078 buf[0] = 0x80; \
1079 buf[1] = 0xF0 + data[3]; /* METHOD */ \
1080 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \
1081 p = buf + 4; \
1082 if (data[3] == COMPOSITION_WITH_RULE \
1083 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
1084 { \
1085 p += CHAR_STRING (data[4], p); \
1086 for (i = 5; i < len; i += 2) \
1087 { \
1088 int gref, nref; \
1089 COMPOSITION_DECODE_RULE (data[i], gref, nref); \
1090 *p++ = 0x20 + gref; \
1091 *p++ = 0x20 + nref; \
1092 p += CHAR_STRING (data[i + 1], p); \
1093 } \
1094 } \
1095 else \
1096 { \
1097 for (i = 4; i < len; i++) \
1098 p += CHAR_STRING (data[i], p); \
1099 } \
1100 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
1101 \
1102 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
1103 { \
1104 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
1105 goto label_end_of_loop; \
1106 } \
1107 while (p0 < p) \
1108 *dst++ = *p0++; \
1109 coding->cmp_data_start += data[0]; \
1110 if (coding->cmp_data_start == coding->cmp_data->used \
1111 && coding->cmp_data->next) \
1112 { \
1113 coding->cmp_data = coding->cmp_data->next; \
1114 coding->cmp_data_start = 0; \
1115 } \
1116 } while (0)
1117
1118
/* Forward declaration: encode_eol is called by
   encode_coding_emacs_mule below when there is no composition data.  */
1119 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1120 unsigned char *, int, int));
1121
/* Encode the SRC_BYTES bytes at SOURCE into emacs-mule format at
   DESTINATION.

   If CODING carries no composition data (CODING->cmp_data is null or
   its `used' count is 0), the whole job is handed to encode_eol.

   Otherwise characters are read one at a time with ONE_MORE_CHAR and
   written back:
     - '\n' becomes CR LF for CODING_EOL_CRLF and CR for
       CODING_EOL_CR;
     - a single-byte character is written as one byte, except that
       when CODING->flags is nonzero a non-ASCII one is written in
       the form produced by CHAR_STRING (one or two bytes);
     - any other character is copied as the bytes between SRC_BASE
       and SRC.
   Before reading a character at which a composition starts (that is,
   when char_offset + CODING->consumed_char equals DATA[1]), the
   composition is written out with ENCODE_COMPOSITION_EMACS_MULE.

   The `while (1)' loop contains no break; it is left through
   `goto label_end_of_loop', which ENCODE_COMPOSITION_EMACS_MULE
   performs when the destination is too small and which ONE_MORE_CHAR
   and the EMIT_* macros presumably perform as well (they are defined
   outside this chunk).

   On that exit CODING->consumed is the byte offset of SRC_BASE and
   both CODING->produced and CODING->produced_char are the number of
   bytes stored.  */
1122 static void
1123 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1124 struct coding_system *coding;
1125 const unsigned char *source;
1126 unsigned char *destination;
1127 int src_bytes, dst_bytes;
1128 {
1129 const unsigned char *src = source;
1130 const unsigned char *src_end = source + src_bytes;
1131 unsigned char *dst = destination;
1132 unsigned char *dst_end = destination + dst_bytes;
1133 const unsigned char *src_base;
1134 int c;
1135 int char_offset;
1136 int *data;
1137
/* TRANSLATION_TABLE is not referenced by name anywhere else in this
   function; presumably ONE_MORE_CHAR reads it -- verify against the
   macro definition.  */
1138 Lisp_Object translation_table;
1139
1140 translation_table = Qnil;
1141
1142 /* Optimization for the case that there's no composition. */
1143 if (!coding->cmp_data || coding->cmp_data->used == 0)
1144 {
1145 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1146 return;
1147 }
1148
1149 char_offset = coding->cmp_data->char_offset;
1150 data = coding->cmp_data->data + coding->cmp_data_start;
1151 while (1)
1152 {
1153 src_base = src;
1154
1155 /* If SRC starts a composition, encode the information about the
1156 composition in advance. */
1157 if (coding->cmp_data_start < coding->cmp_data->used
1158 && char_offset + coding->consumed_char == data[1])
1159 {
1160 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
/* The macro may have advanced CODING->cmp_data_start or moved
   CODING->cmp_data to the next block, so reload both.  */
1161 char_offset = coding->cmp_data->char_offset;
1162 data = coding->cmp_data->data + coding->cmp_data_start;
1163 }
1164
1165 ONE_MORE_CHAR (c);
1166 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1167 || coding->eol_type == CODING_EOL_CR))
1168 {
1169 if (coding->eol_type == CODING_EOL_CRLF)
1170 EMIT_TWO_BYTES ('\r', c);
1171 else
1172 EMIT_ONE_BYTE ('\r');
1173 }
1174 else if (SINGLE_BYTE_CHAR_P (c))
1175 {
1176 if (coding->flags && ! ASCII_BYTE_P (c))
1177 {
1178 /* As we are auto saving, retain the multibyte form for
1179 8-bit chars. */
1180 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1181 int bytes = CHAR_STRING (c, buf);
1182
1183 if (bytes == 1)
1184 EMIT_ONE_BYTE (buf[0]);
1185 else
1186 EMIT_TWO_BYTES (buf[0], buf[1]);
1187 }
1188 else
1189 EMIT_ONE_BYTE (c);
1190 }
1191 else
1192 EMIT_BYTES (src_base, src);
1193 coding->consumed_char++;
1194 }
1195 label_end_of_loop:
1196 coding->consumed = src_base - source;
1197 coding->produced = coding->produced_char = dst - destination;
1198 return;
1199 }
1200
1201 \f
1202 /*** 3. ISO2022 handlers ***/
1203
1204 /* The following note describes the coding system ISO2022 briefly.
1205 Since the intention of this note is to help understand the
1206 functions in this file, some parts are NOT ACCURATE or are OVERLY
1207 SIMPLIFIED. For thorough understanding, please refer to the
1208 original document of ISO2022. This is equivalent to the standard
1209 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1210
1211 ISO2022 provides many mechanisms to encode several character sets
1212 in 7-bit and 8-bit environments. For 7-bit environments, all text
1213 is encoded using bytes less than 128. This may make the encoded
1214 text a little bit longer, but the text passes more easily through
1215 several types of gateway, some of which strip off the MSB (Most
1216 Significant Bit).
1217
1218 There are two kinds of character sets: control character sets and
1219 graphic character sets. The former contain control characters such
1220 as `newline' and `escape' to provide control functions (control
1221 functions are also provided by escape sequences). The latter
1222 contain graphic characters such as 'A' and '-'. Emacs recognizes
1223 two control character sets and many graphic character sets.
1224
1225 Graphic character sets are classified into one of the following
1226 four classes, according to the number of bytes (DIMENSION) and
1227 number of characters in one dimension (CHARS) of the set:
1228 - DIMENSION1_CHARS94
1229 - DIMENSION1_CHARS96
1230 - DIMENSION2_CHARS94
1231 - DIMENSION2_CHARS96
1232
1233 In addition, each character set is assigned an identification tag,
1234 unique for each set, called the "final character" (denoted as <F>
1235 hereafter). The <F> of each character set is decided by ECMA(*)
1236 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1237 (0x30..0x3F are for private use only).
1238
1239 Note (*): ECMA = European Computer Manufacturers Association
1240
1241 Here are examples of graphic character sets [NAME(<F>)]:
1242 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1243 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1244 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1245 o DIMENSION2_CHARS96 -- none for the moment
1246
1247 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1248 C0 [0x00..0x1F] -- control character plane 0
1249 GL [0x20..0x7F] -- graphic character plane 0
1250 C1 [0x80..0x9F] -- control character plane 1
1251 GR [0xA0..0xFF] -- graphic character plane 1
1252
1253 A control character set is directly designated and invoked to C0 or
1254 C1 by an escape sequence. The most common case is that:
1255 - ISO646's control character set is designated/invoked to C0, and
1256 - ISO6429's control character set is designated/invoked to C1,
1257 and usually these designations/invocations are omitted in encoded
1258 text. In a 7-bit environment, only C0 can be used, and a control
1259 character for C1 is encoded by an appropriate escape sequence to
1260 fit into the environment. All control characters for C1 are
1261 defined to have corresponding escape sequences.
1262
1263 A graphic character set is at first designated to one of four
1264 graphic registers (G0 through G3), then these graphic registers are
1265 invoked to GL or GR. These designations and invocations can be
1266 done independently. The most common case is that G0 is invoked to
1267 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1268 these invocations and designations are omitted in encoded text.
1269 In a 7-bit environment, only GL can be used.
1270
1271 When a graphic character set of CHARS94 is invoked to GL, codes
1272 0x20 and 0x7F of the GL area work as control characters SPACE and
1273 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1274 be used.
1275
1276 There are two ways of invocation: locking-shift and single-shift.
1277 With locking-shift, the invocation lasts until the next different
1278 invocation, whereas with single-shift, the invocation affects the
1279 following character only and doesn't affect the locking-shift
1280 state. Invocations are done by the following control characters or
1281 escape sequences:
1282
1283 ----------------------------------------------------------------------
1284 abbrev function cntrl escape seq description
1285 ----------------------------------------------------------------------
1286 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1287 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1288 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1289 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1290 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1291 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1292 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1293 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1294 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1295 ----------------------------------------------------------------------
1296 (*) These are not used by any known coding system.
1297
1298 Control characters for these functions are defined by macros
1299 ISO_CODE_XXX in `coding.h'.
1300
1301 Designations are done by the following escape sequences:
1302 ----------------------------------------------------------------------
1303 escape sequence description
1304 ----------------------------------------------------------------------
1305 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1306 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1307 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1308 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1309 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1310 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1311 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1312 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1313 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1314 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1315 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1316 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1317 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1318 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1319 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1320 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1321 ----------------------------------------------------------------------
1322
1323 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1324 of dimension 1, chars 94, and final character <F>, etc...
1325
1326 Note (*): Although these designations are not allowed in ISO2022,
1327 Emacs accepts them on decoding, and produces them on encoding
1328 CHARS96 character sets in a coding system which is characterized as
1329 7-bit environment, non-locking-shift, and non-single-shift.
1330
1331 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1332 '(' can be omitted. We refer to this as "short-form" hereafter.
1333
1334 Now you may notice that there are a lot of ways of encoding the
1335 same multilingual text in ISO2022. Actually, there exist many
1336 coding systems such as Compound Text (used in X11's inter client
1337 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1338 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1339 localized platforms), and all of these are variants of ISO2022.
1340
1341 In addition to the above, Emacs handles two more kinds of escape
1342 sequences: ISO6429's direction specification and Emacs' private
1343 sequence for specifying character composition.
1344
1345 ISO6429's direction specification takes the following form:
1346 o CSI ']' -- end of the current direction
1347 o CSI '0' ']' -- end of the current direction
1348 o CSI '1' ']' -- start of left-to-right text
1349 o CSI '2' ']' -- start of right-to-left text
1350 The control character CSI (0x9B: control sequence introducer) is
1351 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1352
1353 Character composition specification takes the following form:
1354 o ESC '0' -- start relative composition
1355 o ESC '1' -- end composition
1356 o ESC '2' -- start rule-base composition (*)
1357 o ESC '3' -- start relative composition with alternate chars (**)
1358 o ESC '4' -- start rule-base composition with alternate chars (**)
1359 Since these are not standard escape sequences of any ISO standard,
1360 the use of them with these meanings is restricted to Emacs only.
1361
1362 (*) This form is used only in Emacs 20.5 and older versions,
1363 but the newer versions can safely decode it.
1364 (**) This form is used only in Emacs 21.1 and newer versions,
1365 and the older versions can't decode it.
1366
1367 Here's a list of example usages of these composition escape
1368 sequences (categorized by `enum composition_method').
1369
1370 COMPOSITION_RELATIVE:
1371 ESC 0 CHAR [ CHAR ] ESC 1
1372 COMPOSITION_WITH_RULE:
1373 ESC 2 CHAR [ RULE CHAR ] ESC 1
1374 COMPOSITION_WITH_ALTCHARS:
1375 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1376 COMPOSITION_WITH_RULE_ALTCHARS:
1377 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1378
/* Table of 256 ISO code classes.  decode_coding_iso2022 switches on
   the entry selected by each byte it reads from the source.  */
1379 enum iso_code_class_type iso_code_class[256];
1380
/* Nonzero if the coding system registered at coding_system_table[IDX]
   exists, can handle character C (always true when CHARSET is
   CHARSET_ASCII, otherwise C must satisfy CODING_SAFE_CHAR_P), and
   has some requested designation for CHARSET.

   As a side effect, for a non-ASCII CHARSET the variable `safe_chars'
   of the expanding function is assigned the result of
   coding_safe_chars for that coding system.  */
1381 #define CHARSET_OK(idx, charset, c) \
1382 (coding_system_table[idx] \
1383 && (charset == CHARSET_ASCII \
1384 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1385 CODING_SAFE_CHAR_P (safe_chars, c))) \
1386 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1387 charset) \
1388 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1389
/* Nonzero if the coding system at coding_system_table[IDX] has a
   non-negative initial designation for graphic register 1.
   NOTE(review): unlike CHARSET_OK, the table entry is dereferenced
   without a null test.  */
1390 #define SHIFT_OUT_OK(idx) \
1391 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1392
/* Nonzero unless composition is disabled for the coding system at
   coding_system_table[IDX].  NOTE(review): the table entry is
   dereferenced without a null test.  */
1393 #define COMPOSITION_OK(idx) \
1394 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1395
1396 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1397 Check if a text is encoded in ISO2022. If it is, return an
1398 integer in which the appropriate flag bits among:
1399 CODING_CATEGORY_MASK_ISO_7
1400 CODING_CATEGORY_MASK_ISO_7_TIGHT
1401 CODING_CATEGORY_MASK_ISO_8_1
1402 CODING_CATEGORY_MASK_ISO_8_2
1403 CODING_CATEGORY_MASK_ISO_7_ELSE
1404 CODING_CATEGORY_MASK_ISO_8_ELSE
1405 are set. If a code which should never appear in ISO2022 is found,
1406 return 0.

   The text to examine lies between SRC and SRC_END.  MULTIBYTEP is
   passed unchanged to ONE_MORE_BYTE_CHECK_MULTIBYTE, which reads the
   bytes; its third argument is presumably the value returned when
   the source is exhausted (the macro is defined outside this chunk).

   MASK holds the categories that have not been ruled out yet and
   MASK_FOUND those for which positive evidence was seen; the value
   returned at the end of the loop is their intersection.  REG[]
   records the charset designated to each of the four graphic
   registers by the escape sequences seen so far; REG[0] starts as
   CHARSET_ASCII and the others as -1.  SINGLE_SHIFTING is nonzero
   right after an SS2 or SS3 byte.

   NOTE(review): SHIFT_OUT is initialized to 0 and no assignment to
   it is visible in this function, so the `shift_out == 1' test under
   ISO_CODE_SI cannot succeed as written.  */
1407
1408 static int
1409 detect_coding_iso2022 (src, src_end, multibytep)
1410 unsigned char *src, *src_end;
1411 int multibytep;
1412 {
1413 int mask = CODING_CATEGORY_MASK_ISO;
1414 int mask_found = 0;
1415 int reg[4], shift_out = 0, single_shifting = 0;
1416 int c, c1, charset;
1417 /* Dummy for ONE_MORE_BYTE. */
1418 struct coding_system dummy_coding;
1419 struct coding_system *coding = &dummy_coding;
1420 Lisp_Object safe_chars;
1421
1422 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1423 while (mask)
1424 {
1425 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1426 retry:
1427 switch (c)
1428 {
1429 case ISO_CODE_ESC:
1430 if (inhibit_iso_escape_detection)
1431 break;
1432 single_shifting = 0;
1433 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1434 if (c >= '(' && c <= '/')
1435 {
1436 /* Designation sequence for a charset of dimension 1. */
1437 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1438 if (c1 < ' ' || c1 >= 0x80
1439 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1440 /* Invalid designation sequence. Just ignore. */
1441 break;
1442 reg[(c - '(') % 4] = charset;
1443 }
1444 else if (c == '$')
1445 {
1446 /* Designation sequence for a charset of dimension 2. */
1447 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1448 if (c >= '@' && c <= 'B')
1449 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1450 reg[0] = charset = iso_charset_table[1][0][c];
1451 else if (c >= '(' && c <= '/')
1452 {
1453 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1454 mask & mask_found);
1455 if (c1 < ' ' || c1 >= 0x80
1456 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1457 /* Invalid designation sequence. Just ignore. */
1458 break;
1459 reg[(c - '(') % 4] = charset;
1460 }
1461 else
1462 /* Invalid designation sequence. Just ignore. */
1463 break;
1464 }
1465 else if (c == 'N' || c == 'O')
1466 {
1467 /* ESC <Fe> for SS2 or SS3. */
1468 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1469 break;
1470 }
1471 else if (c >= '0' && c <= '4')
1472 {
1473 /* ESC <Fp> for start/end composition. */
1474 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1475 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1476 else
1477 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1478 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1486 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1487 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1488 else
1489 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1490 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1491 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1492 else
1493 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1494 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1495 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1496 else
1497 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1498 break;
1499 }
1500 else
1501 /* Invalid escape sequence. Just ignore. */
1502 break;
1503
1504 /* We found a valid designation sequence for CHARSET. */
1505 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1506 c = MAKE_CHAR (charset, 0, 0);
1507 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1508 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1509 else
1510 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1511 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1512 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1513 else
1514 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1515 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1516 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1517 else
1518 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1519 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1520 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1521 else
1522 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1523 break;
1524
1525 case ISO_CODE_SO:
1526 if (inhibit_iso_escape_detection)
1527 break;
1528 single_shifting = 0;
1529 if (shift_out == 0
1530 && (reg[1] >= 0
1531 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1532 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1533 {
1534 /* Locking shift out. */
1535 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1536 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1537 }
1538 break;
1539
1540 case ISO_CODE_SI:
1541 if (inhibit_iso_escape_detection)
1542 break;
1543 single_shifting = 0;
/* NOTE(review): SHIFT_OUT is never assigned in this function, so
   this condition is always false as the code stands.  */
1544 if (shift_out == 1)
1545 {
1546 /* Locking shift in. */
1547 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1548 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1549 }
1550 break;
1551
1552 case ISO_CODE_CSI:
1553 single_shifting = 0;
/* Fall through: CSI shares the handling below with SS2 and SS3,
   except for the single-shift part guarded by `c != ISO_CODE_CSI'.  */
1554 case ISO_CODE_SS2:
1555 case ISO_CODE_SS3:
1556 {
1557 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1558
1559 if (inhibit_iso_escape_detection)
1560 break;
1561 if (c != ISO_CODE_CSI)
1562 {
1563 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1564 & CODING_FLAG_ISO_SINGLE_SHIFT)
1565 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1566 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1567 & CODING_FLAG_ISO_SINGLE_SHIFT)
1568 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1569 single_shifting = 1;
1570 }
1571 if (VECTORP (Vlatin_extra_code_table)
1572 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1573 {
1574 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1575 & CODING_FLAG_ISO_LATIN_EXTRA)
1576 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1577 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1578 & CODING_FLAG_ISO_LATIN_EXTRA)
1579 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1580 }
1581 mask &= newmask;
1582 mask_found |= newmask;
1583 }
1584 break;
1585
1586 default:
1587 if (c < 0x80)
1588 {
1589 single_shifting = 0;
1590 break;
1591 }
1592 else if (c < 0xA0)
1593 {
/* A byte in 0x80..0x9F not handled by a case above is accepted
   only if Vlatin_extra_code_table has a non-nil entry for it;
   otherwise the text is not ISO2022.  */
1594 single_shifting = 0;
1595 if (VECTORP (Vlatin_extra_code_table)
1596 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1597 {
1598 int newmask = 0;
1599
1600 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1601 & CODING_FLAG_ISO_LATIN_EXTRA)
1602 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1603 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1604 & CODING_FLAG_ISO_LATIN_EXTRA)
1605 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1606 mask &= newmask;
1607 mask_found |= newmask;
1608 }
1609 else
1610 return 0;
1611 }
1612 else
1613 {
1614 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1615 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1616 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1617 /* Check the length of succeeding codes of the range
1618 0xA0..0xFF. If the byte length is odd, we exclude
1619 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1620 when we are not single shifting. */
1621 if (!single_shifting
1622 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1623 {
/* I counts the bytes of the run, starting with the byte
   already read.  C is reset to -1 so that the test after
   the loop can tell whether the loop read any byte.  */
1624 int i = 1;
1625
1626 c = -1;
1627 while (src < src_end)
1628 {
1629 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1630 mask & mask_found);
1631 if (c < 0xA0)
1632 break;
1633 i++;
1634 }
1635
1636 if (i & 1 && src < src_end)
1637 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1638 else
1639 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1640 if (c >= 0)
1641 /* This means that we have read one extra byte. */
1642 goto retry;
1643 }
1644 }
1645 break;
1646 }
1647 }
1648 return (mask & mask_found);
1649 }
1650
/* Build the character of CHARSET whose 1st position code is C1 and
   whose 2nd position code is C2, and yield its character code.
   When the variable `translation_table' of the expanding function is
   nil, the code comes straight from MAKE_CHAR; otherwise it is the
   value translate_char returns for that table.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)				\
  (! NILP (translation_table)						\
   ? translate_char (translation_table, -1, charset, c1, c2)		\
   : MAKE_CHAR (charset, c1, c2))
1660
1661 /* Set designation state into CODING.

   Record in CODING that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR (looked up through ISO_CHARSET_TABLE) is designated
   to graphic register REG.  The designation is accepted when the
   charset exists (is non-negative) and either CODING requests that
   charset for REG or the charset's character satisfies
   CODING_SAFE_CHAR_P for `safe_chars', a variable of the expanding
   function.  When CODING_MODE_DIRECTION is set in CODING->mode and
   the charset has a reverse charset, the reverse one is recorded
   instead.

   The macro jumps to label_invalid_code, which must exist in the
   expanding function, in these cases:
     - FINAL_CHAR is below '0' or is 128 or above;
     - the designation is not accepted; REG is then remembered in
       last_invalid_designation_register;
     - ASCII is designated to register 0 while
       last_invalid_designation_register is 0 (see the comment in the
       body); the register is reset to -1 first.

   NOTE(review): MAKE_CHAR is evaluated with CHARSET before the
   `charset >= 0' test -- confirm that it tolerates a negative
   charset.  */
1662 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1663 do { \
1664 int charset, c; \
1665 \
1666 if (final_char < '0' || final_char >= 128) \
1667 goto label_invalid_code; \
1668 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1669 make_number (chars), \
1670 make_number (final_char)); \
1671 c = MAKE_CHAR (charset, 0, 0); \
1672 if (charset >= 0 \
1673 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1674 || CODING_SAFE_CHAR_P (safe_chars, c))) \
1675 { \
1676 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1677 && reg == 0 \
1678 && charset == CHARSET_ASCII) \
1679 { \
1680 /* We should insert this designation sequence as is so \
1681 that it is surely written back to a file. */ \
1682 coding->spec.iso2022.last_invalid_designation_register = -1; \
1683 goto label_invalid_code; \
1684 } \
1685 coding->spec.iso2022.last_invalid_designation_register = -1; \
1686 if ((coding->mode & CODING_MODE_DIRECTION) \
1687 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1688 charset = CHARSET_REVERSE_CHARSET (charset); \
1689 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1690 } \
1691 else \
1692 { \
1693 coding->spec.iso2022.last_invalid_designation_register = reg; \
1694 goto label_invalid_code; \
1695 } \
1696 } while (0)
1697
1698 /* Allocate a memory block for storing information about compositions.
1699 The block is chained to the already allocated blocks. */
1700
1701 void
1702 coding_allocate_composition_data (coding, char_offset)
1703 struct coding_system *coding;
1704 int char_offset;
1705 {
1706 struct composition_data *cmp_data
1707 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1708
1709 cmp_data->char_offset = char_offset;
1710 cmp_data->used = 0;
1711 cmp_data->prev = coding->cmp_data;
1712 cmp_data->next = NULL;
1713 if (coding->cmp_data)
1714 coding->cmp_data->next = cmp_data;
1715 coding->cmp_data = cmp_data;
1716 coding->cmp_data_start = 0;
1717 coding->composing = COMPOSITION_NO;
1718 }
1719
1720 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1721 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1722 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1723 ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1724 ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  The macro refers to CODING and
   DST of the expanding function and distinguishes three cases:

   1. Composition is disabled for CODING: the two bytes ESC and
      C1 & 0x7f are stored at DST and CODING->produced_char grows
      by 2.

   2. No composition is in progress: a new one is started with the
      method selected by C1.  If CODING->cmp_data is null or has no
      room for COMPOSITION_DATA_MAX_BUNCH_LENGTH more entries,
      CODING->result is set to CODING_FINISH_INSUFFICIENT_CMP and
      control jumps to label_end_of_loop instead.

   3. A composition is already in progress: if its method is
      COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS,
      the method becomes COMPOSITION_RELATIVE; for any other method
      nothing is done.
1725 */
1726
1727 #define DECODE_COMPOSITION_START(c1) \
1728 do { \
1729 if (coding->composing == COMPOSITION_DISABLED) \
1730 { \
1731 *dst++ = ISO_CODE_ESC; \
1732 *dst++ = c1 & 0x7f; \
1733 coding->produced_char += 2; \
1734 } \
1735 else if (!COMPOSING_P (coding)) \
1736 { \
1737 /* This is surely the start of a composition. We must be sure \
1738 that coding->cmp_data has enough space to store the \
1739 information about the composition. If not, terminate the \
1740 current decoding loop, allocate one more memory block for \
1741 coding->cmp_data in the caller, then start the decoding \
1742 loop again. We can't allocate memory here directly because \
1743 it may cause buffer/string relocation. */ \
1744 if (!coding->cmp_data \
1745 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1746 >= COMPOSITION_DATA_SIZE)) \
1747 { \
1748 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1749 goto label_end_of_loop; \
1750 } \
1751 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1752 : c1 == '2' ? COMPOSITION_WITH_RULE \
1753 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1754 : COMPOSITION_WITH_RULE_ALTCHARS); \
1755 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1756 coding->composing); \
1757 coding->composition_rule_follows = 0; \
1758 } \
1759 else \
1760 { \
1761 /* We are already handling a composition. If the method is \
1762 the following two, the codes following the current escape \
1763 sequence are actual characters stored in a buffer. */ \
1764 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1765 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1766 { \
1767 coding->composing = COMPOSITION_RELATIVE; \
1768 coding->composition_rule_follows = 0; \
1769 } \
1770 } \
1771 } while (0)
1772
/* Handle the composition end sequence ESC 1; C1 is the byte that
   followed ESC.  While a composition is in progress, its end is
   recorded at the current CODING->produced_char and the composing
   state returns to COMPOSITION_NO.  Otherwise the two bytes ESC and
   C1 are stored at DST of the expanding function and counted as two
   produced characters.  */

#define DECODE_COMPOSITION_END(c1)					\
  do {									\
    if (COMPOSING_P (coding))						\
      {									\
	CODING_ADD_COMPOSITION_END (coding, coding->produced_char);	\
	coding->composing = COMPOSITION_NO;				\
      }									\
    else								\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = c1;							\
	coding->produced_char += 2;					\
      }									\
  } while (0)
1789
1790 /* Decode a composition rule from the byte C1 (and maybe one more byte
1791 from SRC) and store one encoded composition rule in
1792 coding->cmp_data.

   C1 must be a modifiable variable: 32 is subtracted from it in
   place.  With the result below 81 the rule is in the old one-byte
   format, where the two references are the quotient and remainder of
   a division by 9 and a reference of 4 is replaced by 10.  With the
   result from 81 to 92 the rule is in the new format: a second byte
   is read with ONE_MORE_BYTE into `c2', a variable of the expanding
   function, and the references are C1 - 81 and C2 - 32.  For any
   larger value the rule stored is 0.

   In every case CODING->composition_rule_follows is cleared.  */
1793
1794 #define DECODE_COMPOSITION_RULE(c1) \
1795 do { \
1796 int rule = 0; \
1797 (c1) -= 32; \
1798 if (c1 < 81) /* old format (before ver.21) */ \
1799 { \
1800 int gref = (c1) / 9; \
1801 int nref = (c1) % 9; \
1802 if (gref == 4) gref = 10; \
1803 if (nref == 4) nref = 10; \
1804 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
1805 } \
1806 else if (c1 < 93) /* new format (after ver.21) */ \
1807 { \
1808 ONE_MORE_BYTE (c2); \
1809 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1810 } \
1811 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
1812 coding->composition_rule_follows = 0; \
1813 } while (0)
1814
1815
1816 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1817
1818 static void
1819 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1820 struct coding_system *coding;
1821 const unsigned char *source;
1822 unsigned char *destination;
1823 int src_bytes, dst_bytes;
1824 {
1825 const unsigned char *src = source;
1826 const unsigned char *src_end = source + src_bytes;
1827 unsigned char *dst = destination;
1828 unsigned char *dst_end = destination + dst_bytes;
1829 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1830 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1831 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1832 /* SRC_BASE remembers the start position in source in each loop.
1833 The loop will be exited when there's not enough source code
1834 (within macro ONE_MORE_BYTE), or when there's not enough
1835 destination area to produce a character (within macro
1836 EMIT_CHAR). */
1837 const unsigned char *src_base;
1838 int c, charset;
1839 Lisp_Object translation_table;
1840 Lisp_Object safe_chars;
1841
1842 safe_chars = coding_safe_chars (coding->symbol);
1843
1844 if (NILP (Venable_character_translation))
1845 translation_table = Qnil;
1846 else
1847 {
1848 translation_table = coding->translation_table_for_decode;
1849 if (NILP (translation_table))
1850 translation_table = Vstandard_translation_table_for_decode;
1851 }
1852
1853 coding->result = CODING_FINISH_NORMAL;
1854
1855 while (1)
1856 {
1857 int c1, c2 = 0;
1858
1859 src_base = src;
1860 ONE_MORE_BYTE (c1);
1861
1862 /* We produce no character or one character. */
1863 switch (iso_code_class [c1])
1864 {
1865 case ISO_0x20_or_0x7F:
1866 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1867 {
1868 DECODE_COMPOSITION_RULE (c1);
1869 continue;
1870 }
1871 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1872 {
1873 /* This is SPACE or DEL. */
1874 charset = CHARSET_ASCII;
1875 break;
1876 }
1877 /* This is a graphic character, we fall down ... */
1878
1879 case ISO_graphic_plane_0:
1880 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1881 {
1882 DECODE_COMPOSITION_RULE (c1);
1883 continue;
1884 }
1885 charset = charset0;
1886 break;
1887
1888 case ISO_0xA0_or_0xFF:
1889 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1890 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1891 goto label_invalid_code;
1892 /* This is a graphic character, we fall down ... */
1893
1894 case ISO_graphic_plane_1:
1895 if (charset1 < 0)
1896 goto label_invalid_code;
1897 charset = charset1;
1898 break;
1899
1900 case ISO_control_0:
1901 if (COMPOSING_P (coding))
1902 DECODE_COMPOSITION_END ('1');
1903
1904 /* All ISO2022 control characters in this class have the
1905 same representation in Emacs internal format. */
1906 if (c1 == '\n'
1907 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1908 && (coding->eol_type == CODING_EOL_CR
1909 || coding->eol_type == CODING_EOL_CRLF))
1910 {
1911 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1912 goto label_end_of_loop;
1913 }
1914 charset = CHARSET_ASCII;
1915 break;
1916
1917 case ISO_control_1:
1918 if (COMPOSING_P (coding))
1919 DECODE_COMPOSITION_END ('1');
1920 goto label_invalid_code;
1921
1922 case ISO_carriage_return:
1923 if (COMPOSING_P (coding))
1924 DECODE_COMPOSITION_END ('1');
1925
1926 if (coding->eol_type == CODING_EOL_CR)
1927 c1 = '\n';
1928 else if (coding->eol_type == CODING_EOL_CRLF)
1929 {
1930 ONE_MORE_BYTE (c1);
1931 if (c1 != ISO_CODE_LF)
1932 {
1933 src--;
1934 c1 = '\r';
1935 }
1936 }
1937 charset = CHARSET_ASCII;
1938 break;
1939
1940 case ISO_shift_out:
1941 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1942 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1943 goto label_invalid_code;
1944 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1945 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1946 continue;
1947
1948 case ISO_shift_in:
1949 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1950 goto label_invalid_code;
1951 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1952 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1953 continue;
1954
1955 case ISO_single_shift_2_7:
1956 case ISO_single_shift_2:
1957 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1958 goto label_invalid_code;
1959 /* SS2 is handled as an escape sequence of ESC 'N' */
1960 c1 = 'N';
1961 goto label_escape_sequence;
1962
1963 case ISO_single_shift_3:
1964 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1965 goto label_invalid_code;
1966 /* SS3 is handled as an escape sequence of ESC 'O' */
1967 c1 = 'O';
1968 goto label_escape_sequence;
1969
1970 case ISO_control_sequence_introducer:
1971 /* CSI is handled as an escape sequence of ESC '[' ... */
1972 c1 = '[';
1973 goto label_escape_sequence;
1974
1975 case ISO_escape:
1976 ONE_MORE_BYTE (c1);
1977 label_escape_sequence:
1978 /* Escape sequences handled by Emacs are invocation,
1979 designation, direction specification, and character
1980 composition specification. */
1981 switch (c1)
1982 {
1983 case '&': /* revision of following character set */
1984 ONE_MORE_BYTE (c1);
1985 if (!(c1 >= '@' && c1 <= '~'))
1986 goto label_invalid_code;
1987 ONE_MORE_BYTE (c1);
1988 if (c1 != ISO_CODE_ESC)
1989 goto label_invalid_code;
1990 ONE_MORE_BYTE (c1);
1991 goto label_escape_sequence;
1992
1993 case '$': /* designation of 2-byte character set */
1994 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1995 goto label_invalid_code;
1996 ONE_MORE_BYTE (c1);
1997 if (c1 >= '@' && c1 <= 'B')
1998 { /* designation of JISX0208.1978, GB2312.1980,
1999 or JISX0208.1980 */
2000 DECODE_DESIGNATION (0, 2, 94, c1);
2001 }
2002 else if (c1 >= 0x28 && c1 <= 0x2B)
2003 { /* designation of DIMENSION2_CHARS94 character set */
2004 ONE_MORE_BYTE (c2);
2005 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2006 }
2007 else if (c1 >= 0x2C && c1 <= 0x2F)
2008 { /* designation of DIMENSION2_CHARS96 character set */
2009 ONE_MORE_BYTE (c2);
2010 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2011 }
2012 else
2013 goto label_invalid_code;
2014 /* We must update these variables now. */
2015 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2016 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2017 continue;
2018
2019 case 'n': /* invocation of locking-shift-2 */
2020 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2021 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2022 goto label_invalid_code;
2023 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2024 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2025 continue;
2026
2027 case 'o': /* invocation of locking-shift-3 */
2028 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2029 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2030 goto label_invalid_code;
2031 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2032 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2033 continue;
2034
2035 case 'N': /* invocation of single-shift-2 */
2036 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2037 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2038 goto label_invalid_code;
2039 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2040 ONE_MORE_BYTE (c1);
2041 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2042 goto label_invalid_code;
2043 break;
2044
2045 case 'O': /* invocation of single-shift-3 */
2046 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2047 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2048 goto label_invalid_code;
2049 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2050 ONE_MORE_BYTE (c1);
2051 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2052 goto label_invalid_code;
2053 break;
2054
2055 case '0': case '2': case '3': case '4': /* start composition */
2056 DECODE_COMPOSITION_START (c1);
2057 continue;
2058
2059 case '1': /* end composition */
2060 DECODE_COMPOSITION_END (c1);
2061 continue;
2062
2063 case '[': /* specification of direction */
2064 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2065 goto label_invalid_code;
2066 /* For the moment, nested direction is not supported.
2067 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2068 left-to-right, and nonzero means right-to-left. */
2069 ONE_MORE_BYTE (c1);
2070 switch (c1)
2071 {
2072 case ']': /* end of the current direction */
2073 coding->mode &= ~CODING_MODE_DIRECTION;
2074
2075 case '0': /* end of the current direction */
2076 case '1': /* start of left-to-right direction */
2077 ONE_MORE_BYTE (c1);
2078 if (c1 == ']')
2079 coding->mode &= ~CODING_MODE_DIRECTION;
2080 else
2081 goto label_invalid_code;
2082 break;
2083
2084 case '2': /* start of right-to-left direction */
2085 ONE_MORE_BYTE (c1);
2086 if (c1 == ']')
2087 coding->mode |= CODING_MODE_DIRECTION;
2088 else
2089 goto label_invalid_code;
2090 break;
2091
2092 default:
2093 goto label_invalid_code;
2094 }
2095 continue;
2096
2097 case '%':
2098 if (COMPOSING_P (coding))
2099 DECODE_COMPOSITION_END ('1');
2100 ONE_MORE_BYTE (c1);
2101 if (c1 == '/')
2102 {
2103 /* CTEXT extended segment:
2104 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2105 We keep these bytes as is for the moment.
2106 They may be decoded by post-read-conversion. */
2107 int dim, M, L;
2108 int size, required;
2109 int produced_chars;
2110
2111 ONE_MORE_BYTE (dim);
2112 ONE_MORE_BYTE (M);
2113 ONE_MORE_BYTE (L);
2114 size = ((M - 128) * 128) + (L - 128);
2115 required = 8 + size * 2;
2116 if (dst + required > (dst_bytes ? dst_end : src))
2117 goto label_end_of_loop;
2118 *dst++ = ISO_CODE_ESC;
2119 *dst++ = '%';
2120 *dst++ = '/';
2121 *dst++ = dim;
2122 produced_chars = 4;
2123 dst += CHAR_STRING (M, dst), produced_chars++;
2124 dst += CHAR_STRING (L, dst), produced_chars++;
2125 while (size-- > 0)
2126 {
2127 ONE_MORE_BYTE (c1);
2128 dst += CHAR_STRING (c1, dst), produced_chars++;
2129 }
2130 coding->produced_char += produced_chars;
2131 }
2132 else if (c1 == 'G')
2133 {
2134 unsigned char *d = dst;
2135 int produced_chars;
2136
2137 /* XFree86 extension for embedding UTF-8 in CTEXT:
2138 ESC % G --UTF-8-BYTES-- ESC % @
2139 We keep these bytes as is for the moment.
2140 They may be decoded by post-read-conversion. */
2141 if (d + 6 > (dst_bytes ? dst_end : src))
2142 goto label_end_of_loop;
2143 *d++ = ISO_CODE_ESC;
2144 *d++ = '%';
2145 *d++ = 'G';
2146 produced_chars = 3;
2147 while (d + 1 < (dst_bytes ? dst_end : src))
2148 {
2149 ONE_MORE_BYTE (c1);
2150 if (c1 == ISO_CODE_ESC
2151 && src + 1 < src_end
2152 && src[0] == '%'
2153 && src[1] == '@')
2154 {
2155 src += 2;
2156 break;
2157 }
2158 d += CHAR_STRING (c1, d), produced_chars++;
2159 }
2160 if (d + 3 > (dst_bytes ? dst_end : src))
2161 goto label_end_of_loop;
2162 *d++ = ISO_CODE_ESC;
2163 *d++ = '%';
2164 *d++ = '@';
2165 dst = d;
2166 coding->produced_char += produced_chars + 3;
2167 }
2168 else
2169 goto label_invalid_code;
2170 continue;
2171
2172 default:
2173 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2174 goto label_invalid_code;
2175 if (c1 >= 0x28 && c1 <= 0x2B)
2176 { /* designation of DIMENSION1_CHARS94 character set */
2177 ONE_MORE_BYTE (c2);
2178 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2179 }
2180 else if (c1 >= 0x2C && c1 <= 0x2F)
2181 { /* designation of DIMENSION1_CHARS96 character set */
2182 ONE_MORE_BYTE (c2);
2183 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2184 }
2185 else
2186 goto label_invalid_code;
2187 /* We must update these variables now. */
2188 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2189 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2190 continue;
2191 }
2192 }
2193
2194 /* Now we know CHARSET and 1st position code C1 of a character.
2195 Produce a multibyte sequence for that character while getting
2196 2nd position code C2 if necessary. */
2197 if (CHARSET_DIMENSION (charset) == 2)
2198 {
2199 ONE_MORE_BYTE (c2);
2200 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2201 /* C2 is not in a valid range. */
2202 goto label_invalid_code;
2203 }
2204 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2205 EMIT_CHAR (c);
2206 continue;
2207
2208 label_invalid_code:
2209 coding->errors++;
2210 if (COMPOSING_P (coding))
2211 DECODE_COMPOSITION_END ('1');
2212 src = src_base;
2213 c = *src++;
2214 if (! NILP (translation_table))
2215 c = translate_char (translation_table, c, 0, 0, 0);
2216 EMIT_CHAR (c);
2217 }
2218
2219 label_end_of_loop:
2220 coding->consumed = coding->consumed_char = src_base - source;
2221 coding->produced = dst - destination;
2222 return;
2223 }
2224
2225
2226 /* ISO2022 encoding stuff. */
2227
2228 /*
2229 It is not enough to say just "ISO2022" on encoding, we have to
2230 specify more details. In Emacs, each ISO2022 coding system
2231 variant has the following specifications:
2232 1. Initial designation to G0 through G3.
2233 2. Allows short-form designation?
2234 3. ASCII should be designated to G0 before control characters?
2235 4. ASCII should be designated to G0 at end of line?
2236 5. 7-bit environment or 8-bit environment?
2237 6. Use locking-shift?
2238 7. Use Single-shift?
2239 And the following two are only for Japanese:
2240 8. Use ASCII in place of JIS0201-1976-Roman?
2241 9. Use JISX0208-1983 in place of JISX0208-1978?
2242 These specifications are encoded in `coding->flags' as flag bits
2243 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2244 details.
2245 */
2246
/* Emit at DST (advancing it) the escape sequence that designates
   CHARSET to graphic register REG, then record the designation in
   CODING.  A revision prefix (ESC '&' <rev>) is produced first when
   the revision number registered for CHARSET is below 255.  For a
   2-dimension, 94-character set whose <final-char> is '@', 'A' or 'B',
   designated to register 0, the intermediate character is omitted
   (short form) if CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);	\
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);	\
    int designates_94_ = CHARSET_CHARS (charset) == 94;			\
    char *intermediate_chars_ = designates_94_ ? "()*+" : ",-./";	\
    int need_intermediate_ = 1;						\
									\
    if (revision < 255)							\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + revision;					\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (CHARSET_DIMENSION (charset) != 1)				\
      {									\
	*dst++ = '$';							\
	/* Short form: no intermediate character at all.  */		\
	if (designates_94_						\
	    && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)		\
	    && reg == 0							\
	    && final_char >= '@' && final_char <= 'B')			\
	  need_intermediate_ = 0;					\
      }									\
    if (need_intermediate_)						\
      *dst++ = (unsigned char) (intermediate_chars_[reg]);		\
    *dst++ = final_char;						\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
2289
/* Single-shift functions of ISO2022.  Each macro emits either the
   8-bit control code (SS2 / SS3) or, when CODING is limited to seven
   bits, the equivalent escape sequence (ESC 'N' / ESC 'O'), and then
   marks CODING as being in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2					\
  do {								\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
      *dst++ = ISO_CODE_SS2;					\
    else							\
      {								\
	*dst++ = ISO_CODE_ESC;					\
	*dst++ = 'N';						\
      }								\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;		\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3					\
  do {								\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
      *dst++ = ISO_CODE_SS3;					\
    else							\
      {								\
	*dst++ = ISO_CODE_ESC;					\
	*dst++ = 'O';						\
      }								\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;		\
  } while (0)
2311
/* Locking-shift functions of ISO2022: shift-in, shift-out,
   locking-shift-2 and locking-shift-3.  Each macro records in CODING
   which graphic register (0 through 3) is now invoked to graphic
   plane 0 and emits the matching control code or escape sequence.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
    *dst++ = ISO_CODE_SI;				\
  } while (0)

#define ENCODE_SHIFT_OUT				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
    *dst++ = ISO_CODE_SO;				\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
  } while (0)
2339
/* Emit the position code C1 of a DIMENSION1 character belonging to
   CHARSET.  If CHARSET is reachable neither through a pending single
   shift nor through graphic plane 0 or 1, the required designation
   and invocation sequences are emitted first and the check is
   repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)			\
	&& charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)		\
	&& charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
      {									\
	/* CHARSET is not invoked to any graphic plane yet: invoke	\
	   it (designating it to a register first if needed), then	\
	   go round again to emit the character itself.  */		\
	dst = encode_invocation_designation (charset, coding, dst);	\
	continue;							\
      }									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  *dst++ = c1 & 0x7F;						\
	else								\
	  *dst++ = c1 | 0x80;						\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
      }									\
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
      *dst++ = c1 & 0x7F;						\
    else								\
      *dst++ = c1 | 0x80;						\
    break;								\
  } while (1)
2372
/* Emit the two position codes C1 and C2 of a DIMENSION2 character
   belonging to CHARSET.  If CHARSET is reachable neither through a
   pending single shift nor through graphic plane 0 or 1, the required
   designation and invocation sequences are emitted first and the
   check is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)			\
	&& charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)		\
	&& charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
      {									\
	/* CHARSET is not invoked to any graphic plane yet: invoke	\
	   it (designating it to a register first if needed), then	\
	   go round again to emit the character itself.  */		\
	dst = encode_invocation_designation (charset, coding, dst);	\
	continue;							\
      }									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    *dst++ = c2 & 0x7F;						\
	  }								\
	else								\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    *dst++ = c2 | 0x80;						\
	  }								\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
      }									\
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
      {									\
	*dst++ = c1 & 0x7F;						\
	*dst++ = c2 & 0x7F;						\
      }									\
    else								\
      {									\
	*dst++ = c1 | 0x80;						\
	*dst++ = c2 | 0x80;						\
      }									\
    break;								\
  } while (1)
2405
/* Encode character C at DST.  C is split into its charset and
   position codes.  If the charset is not defined, the position codes
   are written as they are.  Otherwise the character goes through the
   DIMENSION1 or DIMENSION2 encoder, after substituting
   latin-jisx0201 for ASCII (CODING_FLAG_ISO_USE_ROMAN) or
   jisx0208-1978 for jisx0208 (CODING_FLAG_ISO_USE_OLDJIS) when CODING
   asks for it.  */

#define ENCODE_ISO_CHARACTER(c)						\
  do {									\
    int charset, c1, c2;						\
									\
    SPLIT_CHAR (c, charset, c1, c2);					\
    if (! CHARSET_DEFINED_P (charset))					\
      {									\
	*dst++ = c1;							\
	if (c2 >= 0)							\
	  *dst++ = c2;							\
      }									\
    else if (CHARSET_DIMENSION (charset) != 1)				\
      {									\
	if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)		\
	    && charset == charset_jisx0208)				\
	  charset = charset_jisx0208_1978;				\
	ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);		\
      }									\
    else								\
      {									\
	if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)			\
	    && charset == CHARSET_ASCII)				\
	  charset = charset_latin_jisx0201;				\
	ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);			\
      }									\
  } while (0)
2435
2436
/* Character C must not be encoded: emit CODING_REPLACEMENT_CHARACTER
   in its place, twice when the charset of C has a width greater than
   one and once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)					\
  do {									\
    int unsafe_repeat_ = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;	\
									\
    while (unsafe_repeat_-- > 0)					\
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);		\
  } while (0)
2445
2446
2447 /* Produce designation and invocation codes at a place pointed by DST
2448 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2449 Return new DST. */
2450
2451 unsigned char *
2452 encode_invocation_designation (charset, coding, dst)
2453 int charset;
2454 struct coding_system *coding;
2455 unsigned char *dst;
2456 {
2457 int reg; /* graphic register number */
2458
2459 /* At first, check designations. */
2460 for (reg = 0; reg < 4; reg++)
2461 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2462 break;
2463
2464 if (reg >= 4)
2465 {
2466 /* CHARSET is not yet designated to any graphic registers. */
2467 /* At first check the requested designation. */
2468 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2469 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2470 /* Since CHARSET requests no special designation, designate it
2471 to graphic register 0. */
2472 reg = 0;
2473
2474 ENCODE_DESIGNATION (charset, reg, coding);
2475 }
2476
2477 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2478 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2479 {
2480 /* Since the graphic register REG is not invoked to any graphic
2481 planes, invoke it to graphic plane 0. */
2482 switch (reg)
2483 {
2484 case 0: /* graphic register 0 */
2485 ENCODE_SHIFT_IN;
2486 break;
2487
2488 case 1: /* graphic register 1 */
2489 ENCODE_SHIFT_OUT;
2490 break;
2491
2492 case 2: /* graphic register 2 */
2493 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2494 ENCODE_SINGLE_SHIFT_2;
2495 else
2496 ENCODE_LOCKING_SHIFT_2;
2497 break;
2498
2499 case 3: /* graphic register 3 */
2500 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2501 ENCODE_SINGLE_SHIFT_3;
2502 else
2503 ENCODE_LOCKING_SHIFT_3;
2504 break;
2505 }
2506 }
2507
2508 return dst;
2509 }
2510
/* Emit the two bytes that represent composition rule RULE: RULE is
   decoded into GREF and NREF, which are written as 32 + 81 + GREF
   followed by 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)			\
  do {							\
    int gref, nref;					\
							\
    COMPOSITION_DECODE_RULE (rule, gref, nref);		\
    dst[0] = 32 + 81 + gref;				\
    dst[1] = 32 + nref;					\
    dst += 2;						\
  } while (0)
2520
/* Emit the code that opens a composition sequence: ESC 0 for a
   relative composition, ESC 3 for a composition with alternate
   characters, ESC 4 otherwise.  DATA points to the array of integers
   describing the composition (see coding.h for its layout); DATA[3]
   is taken as the composition method.  For the non-relative methods
   the component index is positioned just after the 4-element header
   and the "rule follows" state is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)				\
  do {									\
    coding->composing = data[3];					\
    *dst++ = ISO_CODE_ESC;						\
    if (coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	if (coding->composing == COMPOSITION_WITH_ALTCHARS)		\
	  *dst++ = '3';							\
	else								\
	  *dst++ = '4';							\
	coding->cmp_data_index = coding->cmp_data_start + 4;		\
	coding->composition_rule_follows = 0;				\
      }									\
    else								\
      *dst++ = '0';							\
  } while (0)
2540
/* Emit ESC 1 to close the current composition, step over its record
   (DATA[0] elements) in the composition data, and leave the composing
   state.  When that exhausts the current composition data block and
   another block is chained after it, switch to the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)				\
  do {									\
    dst[0] = ISO_CODE_ESC;						\
    dst[1] = '1';							\
    dst += 2;								\
    coding->composing = COMPOSITION_NO;					\
    coding->cmp_data_start += data[0];					\
    if (coding->cmp_data_start == coding->cmp_data->used)		\
      {									\
	if (coding->cmp_data->next)					\
	  {								\
	    coding->cmp_data = coding->cmp_data->next;			\
	    coding->cmp_data_start = 0;					\
	  }								\
      }									\
  } while (0)
2556
/* Emit ESC 0 and switch CODING to the relative-composition state.
   This does not open a new composition: it marks the point where the
   components (alternate chars and composition rules) have all been
   produced and the composition's actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)		\
  do {							\
    coding->composing = COMPOSITION_RELATIVE;		\
    dst[0] = ISO_CODE_ESC;				\
    dst[1] = '0';					\
    dst += 2;						\
  } while (0)
2568
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer: ESC '[' when CODING is limited to seven bits, the
   single byte CSI otherwise.  The seven-bit flag is tested as a bit
   in `coding->flags' (as every other macro here does), not compared
   for equality with the whole flag word.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by "2]" and "0]" respectively.  All three are statement
   macros using the variables `coding' and `dst' of the caller.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = '[';					\
      }							\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

#define ENCODE_DIRECTION_R2L			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '2';				\
    *dst++ = ']';				\
  } while (0)

#define ENCODE_DIRECTION_L2R			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '0';				\
    *dst++ = ']';				\
  } while (0)
2584
/* Bring the graphic planes and registers back to their initial
   state: shift in when graphic plane 0 does not have register 0
   invoked, then re-designate every register whose initial designation
   is a valid charset (non-negative) and differs from the current
   one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					\
  do {									\
    int reg = 0;							\
									\
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			\
      ENCODE_SHIFT_IN;							\
    while (reg < 4)							\
      {									\
	if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0)	\
	  {								\
	    if (CODING_SPEC_ISO_DESIGNATION (coding, reg)		\
		!= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))	\
	      ENCODE_DESIGNATION					\
		(CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),	\
		 reg, coding);						\
	  }								\
	reg++;								\
      }									\
  } while (0)
2599
2600 /* Produce designation sequences of charsets in the line started from
2601 SRC to a place pointed by DST, and return updated DST.
2602
2603 If the current block ends before any end-of-line, we may fail to
2604 find all the necessary designations. */
2605
2606 static unsigned char *
2607 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2608 struct coding_system *coding;
2609 Lisp_Object translation_table;
2610 const unsigned char *src, *src_end;
2611 unsigned char *dst;
2612 {
2613 int charset, c, found = 0, reg;
2614 /* Table of charsets to be designated to each graphic register. */
2615 int r[4];
2616
2617 for (reg = 0; reg < 4; reg++)
2618 r[reg] = -1;
2619
2620 while (found < 4)
2621 {
2622 ONE_MORE_CHAR (c);
2623 if (c == '\n')
2624 break;
2625
2626 charset = CHAR_CHARSET (c);
2627 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2628 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2629 {
2630 found++;
2631 r[reg] = charset;
2632 }
2633 }
2634
2635 label_end_of_loop:
2636 if (found)
2637 {
2638 for (reg = 0; reg < 4; reg++)
2639 if (r[reg] >= 0
2640 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2641 ENCODE_DESIGNATION (r[reg], reg, coding);
2642 }
2643
2644 return dst;
2645 }
2646
2647 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes of text at SOURCE into DESTINATION
   following the ISO2022 variant described by CODING.

   On return, coding->consumed holds the number of source bytes
   consumed (measured up to the start of the last iteration that did
   not complete), and coding->produced and coding->produced_char both
   hold the number of bytes written to DESTINATION.
   coding->consumed_char counts the characters encoded and
   coding->errors counts the raw single-byte non-ASCII characters
   copied through as is.  coding->result is set to
   CODING_FINISH_INSUFFICIENT_DST when the destination fills up.

   When DST_BYTES is zero the output is bounded by the current source
   position rather than by DST_END.  NOTE(review): this looks like the
   case where source and destination share one buffer -- confirm
   against the callers.  */
2648
2649 static void
2650 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2651 struct coding_system *coding;
2652 const unsigned char *source;
2653 unsigned char *destination;
2654 int src_bytes, dst_bytes;
2655 {
2656 const unsigned char *src = source;
2657 const unsigned char *src_end = source + src_bytes;
2658 unsigned char *dst = destination;
2659 unsigned char *dst_end = destination + dst_bytes;
2660 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2661 from DST_END to assure overflow checking is necessary only at the
2662 head of loop. */
2663 unsigned char *adjusted_dst_end = dst_end - 19;
2664 /* SRC_BASE remembers the start position in source in each loop.
2665 The loop will be exited when there's not enough source text to
2666 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2667 there's not enough destination area to produce encoded codes
2668 (within macro EMIT_BYTES). */
2669 const unsigned char *src_base;
2670 int c;
2671 Lisp_Object translation_table;
2672 Lisp_Object safe_chars;
2673
/* A coding system flagged as "safe" never emits unencodable
   characters; they are replaced (see ENCODE_UNSAFE_CHARACTER below). */
2674 if (coding->flags & CODING_FLAG_ISO_SAFE)
2675 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2676
2677 safe_chars = coding_safe_chars (coding->symbol);
2678
/* Pick the translation table: none when character translation is
   disabled, else the one of CODING, else the standard one. */
2679 if (NILP (Venable_character_translation))
2680 translation_table = Qnil;
2681 else
2682 {
2683 translation_table = coding->translation_table_for_encode;
2684 if (NILP (translation_table))
2685 translation_table = Vstandard_translation_table_for_encode;
2686 }
2687
2688 coding->consumed_char = 0;
2689 coding->errors = 0;
/* Each iteration emits one composition component, one composition
   start/end marker, or one encoded source character. */
2690 while (1)
2691 {
2692 src_base = src;
2693
/* Stop while there is still room for the bytes a single iteration
   may produce. */
2694 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2695 {
2696 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2697 break;
2698 }
2699
2700 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2701 && CODING_SPEC_ISO_BOL (coding))
2702 {
2703 /* We have to produce designation sequences if any now. */
2704 dst = encode_designation_at_bol (coding, translation_table,
2705 src, src_end, dst);
2706 CODING_SPEC_ISO_BOL (coding) = 0;
2707 }
2708
2709 /* Check composition start and end. */
2710 if (coding->composing != COMPOSITION_DISABLED
2711 && coding->cmp_data_start < coding->cmp_data->used)
2712 {
2713 struct composition_data *cmp_data = coding->cmp_data;
2714 int *data = cmp_data->data + coding->cmp_data_start;
2715 int this_pos = cmp_data->char_offset + coding->consumed_char;
2716
2717 if (coding->composing == COMPOSITION_RELATIVE)
2718 {
/* DATA[2] is compared with the current position to detect the end
   of the composition. */
2719 if (this_pos == data[2])
2720 {
2721 ENCODE_COMPOSITION_END (coding, data);
/* ENCODE_COMPOSITION_END may have advanced to another composition
   record or data block, so reload the local views of it. */
2722 cmp_data = coding->cmp_data;
2723 data = cmp_data->data + coding->cmp_data_start;
2724 }
2725 }
2726 else if (COMPOSING_P (coding))
2727 {
2728 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS */
2729 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2730 /* We have consumed components of the composition.
2731 What follows in SRC is the composition's base
2732 text. */
2733 ENCODE_COMPOSITION_FAKE_START (coding);
2734 else
2735 {
/* Emit the next component, taken from the composition data rather
   than from SRC.  This C shadows the function-level C. */
2736 int c = cmp_data->data[coding->cmp_data_index++];
2737 if (coding->composition_rule_follows)
2738 {
2739 ENCODE_COMPOSITION_RULE (c);
2740 coding->composition_rule_follows = 0;
2741 }
2742 else
2743 {
2744 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2745 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2746 ENCODE_UNSAFE_CHARACTER (c);
2747 else
2748 ENCODE_ISO_CHARACTER (c);
2749 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2750 coding->composition_rule_follows = 1;
2751 }
/* No source character was consumed in this iteration. */
2752 continue;
2753 }
2754 }
2755 if (!COMPOSING_P (coding))
2756 {
/* DATA[1] is compared with the current position to detect the
   start of a composition. */
2757 if (this_pos == data[1])
2758 {
2759 ENCODE_COMPOSITION_START (coding, data);
2760 continue;
2761 }
2762 }
2763 }
2764
2765 ONE_MORE_CHAR (c);
2766
2767 /* Now encode the character C. */
2768 if (c < 0x20 || c == 0x7F)
2769 {
2770 if (c == '\r')
2771 {
2772 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2773 {
2774 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2775 ENCODE_RESET_PLANE_AND_REGISTER;
2776 *dst++ = c;
/* NOTE(review): this `continue' skips the coding->consumed_char++
   at the bottom of the loop -- confirm that this is intended. */
2777 continue;
2778 }
2779 /* fall down to treat '\r' as '\n' ... */
2780 c = '\n';
2781 }
2782 if (c == '\n')
2783 {
2784 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2785 ENCODE_RESET_PLANE_AND_REGISTER;
/* Forget the designations made so far: the recorded current
   designations are reset to the initial ones without emitting
   anything. */
2786 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2787 bcopy (coding->spec.iso2022.initial_designation,
2788 coding->spec.iso2022.current_designation,
2789 sizeof coding->spec.iso2022.initial_designation);
/* Emit the end-of-line sequence selected by coding->eol_type. */
2790 if (coding->eol_type == CODING_EOL_LF
2791 || coding->eol_type == CODING_EOL_UNDECIDED)
2792 *dst++ = ISO_CODE_LF;
2793 else if (coding->eol_type == CODING_EOL_CRLF)
2794 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2795 else
2796 *dst++ = ISO_CODE_CR;
2797 CODING_SPEC_ISO_BOL (coding) = 1;
2798 }
2799 else
2800 {
2801 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2802 ENCODE_RESET_PLANE_AND_REGISTER;
2803 *dst++ = c;
2804 }
2805 }
2806 else if (ASCII_BYTE_P (c))
2807 ENCODE_ISO_CHARACTER (c);
2808 else if (SINGLE_BYTE_CHAR_P (c))
2809 {
/* A non-ASCII single-byte character is copied as is and counted as
   an error. */
2810 *dst++ = c;
2811 coding->errors++;
2812 }
2813 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2814 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2815 ENCODE_UNSAFE_CHARACTER (c);
2816 else
2817 ENCODE_ISO_CHARACTER (c);
2818
2819 coding->consumed_char++;
2820 }
2821
/* Reached by the `break' above and, presumably, by a jump from
   within ONE_MORE_CHAR when the source runs out.  Only the text
   before SRC_BASE counts as consumed. */
2822 label_end_of_loop:
2823 coding->consumed = src_base - source;
2824 coding->produced = coding->produced_char = dst - destination;
2825 }
2826
2827 \f
2828 /*** 4. SJIS and BIG5 handlers ***/
2829
2830 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2831 quite widely. So, for the moment, Emacs supports them in the bare
2832 C code. But, in the future, they may be supported only by CCL. */
2833
2834 /* SJIS is a coding system encoding three character sets: ASCII, right
2835 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2836 as is. A character of charset katakana-jisx0201 is encoded by
2837 "position-code + 0x80". A character of charset japanese-jisx0208
2838 is encoded in 2-byte but two position-codes are divided and shifted
2839 so that it fits in the range below.
2840
2841 --- CODE RANGE of SJIS ---
2842 (character set) (range)
2843 ASCII 0x00 .. 0x7F
2844 KATAKANA-JISX0201 0xA1 .. 0xDF
2845 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2846 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2847 -------------------------------
2848
2849 */
2850
2851 /* BIG5 is a coding system encoding two character sets: ASCII and
2852 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2853 character set and is encoded in two bytes.
2854
2855 --- CODE RANGE of BIG5 ---
2856 (character set) (range)
2857 ASCII 0x00 .. 0x7F
2858 Big5 (1st byte) 0xA1 .. 0xFE
2859 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2860 --------------------------
2861
2862 Since the number of characters in Big5 is larger than maximum
2863 characters in Emacs' charset (96x96), it can't be handled as one
2864 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2865 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2866 contains frequently used characters and the latter contains less
2867 frequently used characters. */
2868
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   Every input argument is evaluated exactly once and is fully
   parenthesized, so expressions may be passed safely.  The output
   arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte. */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    int big5_lead_ = (b1);						\
    int big5_trail_ = (b2);						\
    /* Linear index of the character within the whole Big5 table;	\
       trail bytes skip the gap between 0x7E and 0xA1.  */		\
    unsigned int big5_index_						\
      = ((big5_lead_ - 0xA1) * BIG5_SAME_ROW				\
	 + big5_trail_ - (big5_trail_ < 0x7F ? 0x40 : 0x62));		\
    if (big5_lead_ < 0xC9)						\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	/* Rebase the index on the first row of the second charset.  */	\
	big5_index_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;			\
      }									\
    (c1) = big5_index_ / (0xFF - 0xA1) + 0x21;				\
    (c2) = big5_index_ % (0xFF - 0xA1) + 0x21;				\
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int big5_index_						\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    unsigned int big5_column_;						\
    if ((charset) == charset_big5_2)					\
      big5_index_ += BIG5_SAME_ROW * (0xC9 - 0xA1);			\
    big5_column_ = big5_index_ % BIG5_SAME_ROW;				\
    (b1) = big5_index_ / BIG5_SAME_ROW + 0xA1;				\
    /* Columns below 0x3F map to 0x40..0x7E, the rest to 0xA1..0xFE.  */ \
    (b2) = big5_column_ + (big5_column_ < 0x3F ? 0x40 : 0x62);		\
  } while (0)
2901
2902 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2903 Check if a text is encoded in SJIS. If it is, return
2904 CODING_CATEGORY_MASK_SJIS, else return 0. */
2905
2906 static int
2907 detect_coding_sjis (src, src_end, multibytep)
2908 unsigned char *src, *src_end;
2909 int multibytep;
2910 {
2911 int c;
2912 /* Dummy for ONE_MORE_BYTE. */
2913 struct coding_system dummy_coding;
2914 struct coding_system *coding = &dummy_coding;
2915
2916 while (1)
2917 {
2918 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2919 if (c < 0x80)
2920 continue;
2921 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2922 return 0;
2923 if (c <= 0x9F || c >= 0xE0)
2924 {
2925 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2926 if (c < 0x40 || c == 0x7F || c > 0xFC)
2927 return 0;
2928 }
2929 }
2930 }
2931
2932 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2933 Check if a text is encoded in BIG5. If it is, return
2934 CODING_CATEGORY_MASK_BIG5, else return 0. */
2935
2936 static int
2937 detect_coding_big5 (src, src_end, multibytep)
2938 unsigned char *src, *src_end;
2939 int multibytep;
2940 {
2941 int c;
2942 /* Dummy for ONE_MORE_BYTE. */
2943 struct coding_system dummy_coding;
2944 struct coding_system *coding = &dummy_coding;
2945
2946 while (1)
2947 {
2948 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2949 if (c < 0x80)
2950 continue;
2951 if (c < 0xA1 || c > 0xFE)
2952 return 0;
2953 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2954 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2955 return 0;
2956 }
2957 }
2958
2959 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2960 Check if a text is encoded in UTF-8. If it is, return
2961 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2962
2963 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2964 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2965 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2966 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2967 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2968 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2969 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2970
2971 static int
2972 detect_coding_utf_8 (src, src_end, multibytep)
2973 unsigned char *src, *src_end;
2974 int multibytep;
2975 {
2976 unsigned char c;
2977 int seq_maybe_bytes;
2978 /* Dummy for ONE_MORE_BYTE. */
2979 struct coding_system dummy_coding;
2980 struct coding_system *coding = &dummy_coding;
2981
2982 while (1)
2983 {
2984 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2985 if (UTF_8_1_OCTET_P (c))
2986 continue;
2987 else if (UTF_8_2_OCTET_LEADING_P (c))
2988 seq_maybe_bytes = 1;
2989 else if (UTF_8_3_OCTET_LEADING_P (c))
2990 seq_maybe_bytes = 2;
2991 else if (UTF_8_4_OCTET_LEADING_P (c))
2992 seq_maybe_bytes = 3;
2993 else if (UTF_8_5_OCTET_LEADING_P (c))
2994 seq_maybe_bytes = 4;
2995 else if (UTF_8_6_OCTET_LEADING_P (c))
2996 seq_maybe_bytes = 5;
2997 else
2998 return 0;
2999
3000 do
3001 {
3002 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3003 if (!UTF_8_EXTRA_OCTET_P (c))
3004 return 0;
3005 seq_maybe_bytes--;
3006 }
3007 while (seq_maybe_bytes > 0);
3008 }
3009 }
3010
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking at its first two
   bytes.  If they are the byte order mark of Big Endian (0xFE 0xFF)
   or of Little Endian (0xFF 0xFE), return CODING_CATEGORY_MASK_UTF_16_BE
   or CODING_CATEGORY_MASK_UTF_16_LE respectively, else return 0.  */
3016
/* Nonzero if VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val) \
  (((val) == 0xFFFE) \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit unit VAL is a high surrogate, i.e. in
   0xD800..0xDBFF.  The top six bits select the surrogate block, so
   the mask must be 0xFC00; masking with the expected value itself
   would also accept low surrogates and values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit unit VAL is a low surrogate, i.e. in
   0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3026
3027 static int
3028 detect_coding_utf_16 (src, src_end, multibytep)
3029 unsigned char *src, *src_end;
3030 int multibytep;
3031 {
3032 unsigned char c1, c2;
3033 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3034 struct coding_system dummy_coding;
3035 struct coding_system *coding = &dummy_coding;
3036
3037 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3038 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3039
3040 if ((c1 == 0xFF) && (c2 == 0xFE))
3041 return CODING_CATEGORY_MASK_UTF_16_LE;
3042 else if ((c1 == 0xFE) && (c2 == 0xFF))
3043 return CODING_CATEGORY_MASK_UTF_16_BE;
3044 return 0;
3045 }
3046
3047 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3048 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3049
3050 static void
3051 decode_coding_sjis_big5 (coding, source, destination,
3052 src_bytes, dst_bytes, sjis_p)
3053 struct coding_system *coding;
3054 const unsigned char *source;
3055 unsigned char *destination;
3056 int src_bytes, dst_bytes;
3057 int sjis_p;
3058 {
3059 const unsigned char *src = source;
3060 const unsigned char *src_end = source + src_bytes;
3061 unsigned char *dst = destination;
3062 unsigned char *dst_end = destination + dst_bytes;
3063 /* SRC_BASE remembers the start position in source in each loop.
3064 The loop will be exited when there's not enough source code
3065 (within macro ONE_MORE_BYTE), or when there's not enough
3066 destination area to produce a character (within macro
3067 EMIT_CHAR). */
3068 const unsigned char *src_base;
3069 Lisp_Object translation_table;
3070
3071 if (NILP (Venable_character_translation))
3072 translation_table = Qnil;
3073 else
3074 {
3075 translation_table = coding->translation_table_for_decode;
3076 if (NILP (translation_table))
3077 translation_table = Vstandard_translation_table_for_decode;
3078 }
3079
3080 coding->produced_char = 0;
3081 while (1)
3082 {
3083 int c, charset, c1, c2 = 0;
3084
3085 src_base = src;
3086 ONE_MORE_BYTE (c1);
3087
3088 if (c1 < 0x80)
3089 {
3090 charset = CHARSET_ASCII;
3091 if (c1 < 0x20)
3092 {
3093 if (c1 == '\r')
3094 {
3095 if (coding->eol_type == CODING_EOL_CRLF)
3096 {
3097 ONE_MORE_BYTE (c2);
3098 if (c2 == '\n')
3099 c1 = c2;
3100 else
3101 /* To process C2 again, SRC is subtracted by 1. */
3102 src--;
3103 }
3104 else if (coding->eol_type == CODING_EOL_CR)
3105 c1 = '\n';
3106 }
3107 else if (c1 == '\n'
3108 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3109 && (coding->eol_type == CODING_EOL_CR
3110 || coding->eol_type == CODING_EOL_CRLF))
3111 {
3112 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3113 goto label_end_of_loop;
3114 }
3115 }
3116 }
3117 else
3118 {
3119 if (sjis_p)
3120 {
3121 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3122 goto label_invalid_code;
3123 if (c1 <= 0x9F || c1 >= 0xE0)
3124 {
3125 /* SJIS -> JISX0208 */
3126 ONE_MORE_BYTE (c2);
3127 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3128 goto label_invalid_code;
3129 DECODE_SJIS (c1, c2, c1, c2);
3130 charset = charset_jisx0208;
3131 }
3132 else
3133 /* SJIS -> JISX0201-Kana */
3134 charset = charset_katakana_jisx0201;
3135 }
3136 else
3137 {
3138 /* BIG5 -> Big5 */
3139 if (c1 < 0xA0 || c1 > 0xFE)
3140 goto label_invalid_code;
3141 ONE_MORE_BYTE (c2);
3142 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3143 goto label_invalid_code;
3144 DECODE_BIG5 (c1, c2, charset, c1, c2);
3145 }
3146 }
3147
3148 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3149 EMIT_CHAR (c);
3150 continue;
3151
3152 label_invalid_code:
3153 coding->errors++;
3154 src = src_base;
3155 c = *src++;
3156 EMIT_CHAR (c);
3157 }
3158
3159 label_end_of_loop:
3160 coding->consumed = coding->consumed_char = src_base - source;
3161 coding->produced = dst - destination;
3162 return;
3163 }
3164
3165 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3166 This function can encode charsets `ascii', `katakana-jisx0201',
3167 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3168 are sure that all these charsets are registered as official charset
3169 (i.e. do not have extended leading-codes). Characters of other
3170 charsets are produced without any encoding. If SJIS_P is 1, encode
3171 SJIS text, else encode BIG5 text. */
3172
3173 static void
3174 encode_coding_sjis_big5 (coding, source, destination,
3175 src_bytes, dst_bytes, sjis_p)
3176 struct coding_system *coding;
3177 unsigned char *source, *destination;
3178 int src_bytes, dst_bytes;
3179 int sjis_p;
3180 {
3181 unsigned char *src = source;
3182 unsigned char *src_end = source + src_bytes;
3183 unsigned char *dst = destination;
3184 unsigned char *dst_end = destination + dst_bytes;
3185 /* SRC_BASE remembers the start position in source in each loop.
3186 The loop will be exited when there's not enough source text to
3187 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3188 there's not enough destination area to produce encoded codes
3189 (within macro EMIT_BYTES). */
3190 unsigned char *src_base;
3191 Lisp_Object translation_table;
3192
3193 if (NILP (Venable_character_translation))
3194 translation_table = Qnil;
3195 else
3196 {
3197 translation_table = coding->translation_table_for_encode;
3198 if (NILP (translation_table))
3199 translation_table = Vstandard_translation_table_for_encode;
3200 }
3201
3202 while (1)
3203 {
3204 int c, charset, c1, c2;
3205
3206 src_base = src;
3207 ONE_MORE_CHAR (c);
3208
3209 /* Now encode the character C. */
3210 if (SINGLE_BYTE_CHAR_P (c))
3211 {
3212 switch (c)
3213 {
3214 case '\r':
3215 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3216 {
3217 EMIT_ONE_BYTE (c);
3218 break;
3219 }
3220 c = '\n';
3221 case '\n':
3222 if (coding->eol_type == CODING_EOL_CRLF)
3223 {
3224 EMIT_TWO_BYTES ('\r', c);
3225 break;
3226 }
3227 else if (coding->eol_type == CODING_EOL_CR)
3228 c = '\r';
3229 default:
3230 EMIT_ONE_BYTE (c);
3231 }
3232 }
3233 else
3234 {
3235 SPLIT_CHAR (c, charset, c1, c2);
3236 if (sjis_p)
3237 {
3238 if (charset == charset_jisx0208
3239 || charset == charset_jisx0208_1978)
3240 {
3241 ENCODE_SJIS (c1, c2, c1, c2);
3242 EMIT_TWO_BYTES (c1, c2);
3243 }
3244 else if (charset == charset_katakana_jisx0201)
3245 EMIT_ONE_BYTE (c1 | 0x80);
3246 else if (charset == charset_latin_jisx0201)
3247 EMIT_ONE_BYTE (c1);
3248 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3249 {
3250 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3251 if (CHARSET_WIDTH (charset) > 1)
3252 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3253 }
3254 else
3255 /* There's no way other than producing the internal
3256 codes as is. */
3257 EMIT_BYTES (src_base, src);
3258 }
3259 else
3260 {
3261 if (charset == charset_big5_1 || charset == charset_big5_2)
3262 {
3263 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3264 EMIT_TWO_BYTES (c1, c2);
3265 }
3266 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3267 {
3268 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3269 if (CHARSET_WIDTH (charset) > 1)
3270 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3271 }
3272 else
3273 /* There's no way other than producing the internal
3274 codes as is. */
3275 EMIT_BYTES (src_base, src);
3276 }
3277 }
3278 coding->consumed_char++;
3279 }
3280
3281 label_end_of_loop:
3282 coding->consumed = src_base - source;
3283 coding->produced = coding->produced_char = dst - destination;
3284 }
3285
3286 \f
3287 /*** 5. CCL handlers ***/
3288
3289 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3290 Check if a text is encoded in a coding system of which
3291 encoder/decoder are written in CCL program. If it is, return
3292 CODING_CATEGORY_MASK_CCL, else return 0. */
3293
3294 static int
3295 detect_coding_ccl (src, src_end, multibytep)
3296 unsigned char *src, *src_end;
3297 int multibytep;
3298 {
3299 unsigned char *valid;
3300 int c;
3301 /* Dummy for ONE_MORE_BYTE. */
3302 struct coding_system dummy_coding;
3303 struct coding_system *coding = &dummy_coding;
3304
3305 /* No coding system is assigned to coding-category-ccl. */
3306 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3307 return 0;
3308
3309 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3310 while (1)
3311 {
3312 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3313 if (! valid[c])
3314 return 0;
3315 }
3316 }
3317
3318 \f
3319 /*** 6. End-of-line handlers ***/
3320
3321 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3322
3323 static void
3324 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3325 struct coding_system *coding;
3326 const unsigned char *source;
3327 unsigned char *destination;
3328 int src_bytes, dst_bytes;
3329 {
3330 const unsigned char *src = source;
3331 unsigned char *dst = destination;
3332 const unsigned char *src_end = src + src_bytes;
3333 unsigned char *dst_end = dst + dst_bytes;
3334 Lisp_Object translation_table;
3335 /* SRC_BASE remembers the start position in source in each loop.
3336 The loop will be exited when there's not enough source code
3337 (within macro ONE_MORE_BYTE), or when there's not enough
3338 destination area to produce a character (within macro
3339 EMIT_CHAR). */
3340 const unsigned char *src_base;
3341 int c;
3342
3343 translation_table = Qnil;
3344 switch (coding->eol_type)
3345 {
3346 case CODING_EOL_CRLF:
3347 while (1)
3348 {
3349 src_base = src;
3350 ONE_MORE_BYTE (c);
3351 if (c == '\r')
3352 {
3353 ONE_MORE_BYTE (c);
3354 if (c != '\n')
3355 {
3356 src--;
3357 c = '\r';
3358 }
3359 }
3360 else if (c == '\n'
3361 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3362 {
3363 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3364 goto label_end_of_loop;
3365 }
3366 EMIT_CHAR (c);
3367 }
3368 break;
3369
3370 case CODING_EOL_CR:
3371 while (1)
3372 {
3373 src_base = src;
3374 ONE_MORE_BYTE (c);
3375 if (c == '\n')
3376 {
3377 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3378 {
3379 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3380 goto label_end_of_loop;
3381 }
3382 }
3383 else if (c == '\r')
3384 c = '\n';
3385 EMIT_CHAR (c);
3386 }
3387 break;
3388
3389 default: /* no need for EOL handling */
3390 while (1)
3391 {
3392 src_base = src;
3393 ONE_MORE_BYTE (c);
3394 EMIT_CHAR (c);
3395 }
3396 }
3397
3398 label_end_of_loop:
3399 coding->consumed = coding->consumed_char = src_base - source;
3400 coding->produced = dst - destination;
3401 return;
3402 }
3403
3404 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3405 format of end-of-line according to `coding->eol_type'. It also
3406 convert multibyte form 8-bit characters to unibyte if
3407 CODING->src_multibyte is nonzero. If `coding->mode &
3408 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3409 also means end-of-line. */
3410
3411 static void
3412 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3413 struct coding_system *coding;
3414 const unsigned char *source;
3415 unsigned char *destination;
3416 int src_bytes, dst_bytes;
3417 {
3418 const unsigned char *src = source;
3419 unsigned char *dst = destination;
3420 const unsigned char *src_end = src + src_bytes;
3421 unsigned char *dst_end = dst + dst_bytes;
3422 Lisp_Object translation_table;
3423 /* SRC_BASE remembers the start position in source in each loop.
3424 The loop will be exited when there's not enough source text to
3425 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3426 there's not enough destination area to produce encoded codes
3427 (within macro EMIT_BYTES). */
3428 const unsigned char *src_base;
3429 unsigned char *tmp;
3430 int c;
3431 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3432
3433 translation_table = Qnil;
3434 if (coding->src_multibyte
3435 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3436 {
3437 src_end--;
3438 src_bytes--;
3439 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3440 }
3441
3442 if (coding->eol_type == CODING_EOL_CRLF)
3443 {
3444 while (src < src_end)
3445 {
3446 src_base = src;
3447 c = *src++;
3448 if (c >= 0x20)
3449 EMIT_ONE_BYTE (c);
3450 else if (c == '\n' || (c == '\r' && selective_display))
3451 EMIT_TWO_BYTES ('\r', '\n');
3452 else
3453 EMIT_ONE_BYTE (c);
3454 }
3455 src_base = src;
3456 label_end_of_loop:
3457 ;
3458 }
3459 else
3460 {
3461 if (!dst_bytes || src_bytes <= dst_bytes)
3462 {
3463 safe_bcopy (src, dst, src_bytes);
3464 src_base = src_end;
3465 dst += src_bytes;
3466 }
3467 else
3468 {
3469 if (coding->src_multibyte
3470 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3471 dst_bytes--;
3472 safe_bcopy (src, dst, dst_bytes);
3473 src_base = src + dst_bytes;
3474 dst = destination + dst_bytes;
3475 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3476 }
3477 if (coding->eol_type == CODING_EOL_CR)
3478 {
3479 for (tmp = destination; tmp < dst; tmp++)
3480 if (*tmp == '\n') *tmp = '\r';
3481 }
3482 else if (selective_display)
3483 {
3484 for (tmp = destination; tmp < dst; tmp++)
3485 if (*tmp == '\r') *tmp = '\n';
3486 }
3487 }
3488 if (coding->src_multibyte)
3489 dst = destination + str_as_unibyte (destination, dst - destination);
3490
3491 coding->consumed = src_base - source;
3492 coding->produced = dst - destination;
3493 coding->produced_char = coding->produced;
3494 }
3495
3496 \f
3497 /*** 7. C library functions ***/
3498
3499 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3500 has a property `coding-system'. The value of this property is a
3501 vector of length 5 (called the coding-vector). Among elements of
3502 this vector, the first (element[0]) and the fifth (element[4])
3503 carry important information for decoding/encoding. Before
3504 decoding/encoding, this information should be set in fields of a
3505 structure of type `coding_system'.
3506
3507 The value of the property `coding-system' can be a symbol of another
3508 subsidiary coding-system. In that case, Emacs gets coding-vector
3509 from that symbol.
3510
3511 `element[0]' contains information to be set in `coding->type'. The
3512 value and its meaning is as follows:
3513
3514 0 -- coding_type_emacs_mule
3515 1 -- coding_type_sjis
3516 2 -- coding_type_iso2022
3517 3 -- coding_type_big5
   4 -- coding_type_ccl	encoder/decoder written in CCL
   5 -- coding_type_raw_text
3519 nil -- coding_type_no_conversion
3520 t -- coding_type_undecided (automatic conversion on decoding,
3521 no-conversion on encoding)
3522
3523 `element[4]' contains information to be set in `coding->flags' and
3524 `coding->spec'. The meaning varies by `coding->type'.
3525
3526 If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
3528 Meanings of these sub-elements are:
3529
3530 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3531 If the value is an integer of valid charset, the charset is
3532 assumed to be designated to graphic register N initially.
3533
3534 If the value is minus, it is a minus value of charset which
3535 reserves graphic register N, which means that the charset is
3536 not designated initially but should be designated to graphic
3537 register N just before encoding a character in that charset.
3538
3539 If the value is nil, graphic register N is never used on
3540 encoding.
3541
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3543 Each value takes t or nil. See the section ISO2022 of
3544 `coding.h' for more information.
3545
3546 If `coding->type' is `coding_type_big5', element[4] is t to denote
3547 BIG5-ETen or nil to denote BIG5-HKU.
3548
3549 If `coding->type' takes the other value, element[4] is ignored.
3550
3551 Emacs Lisp's coding systems also carry information about format of
3552 end-of-line in a value of property `eol-type'. If the value is
3553 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3554 means CODING_EOL_CR. If it is not integer, it should be a vector
3555 of subsidiary coding systems of which property `eol-type' has one
3556 of the above values.
3557
3558 */
3559
3560 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3561 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3562 is setup so that no conversion is necessary and return -1, else
3563 return 0. */
3564
3565 int
3566 setup_coding_system (coding_system, coding)
3567 Lisp_Object coding_system;
3568 struct coding_system *coding;
3569 {
3570 Lisp_Object coding_spec, coding_type, eol_type, plist;
3571 Lisp_Object val;
3572
3573 /* At first, zero clear all members. */
3574 bzero (coding, sizeof (struct coding_system));
3575
3576 /* Initialize some fields required for all kinds of coding systems. */
3577 coding->symbol = coding_system;
3578 coding->heading_ascii = -1;
3579 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3580 coding->composing = COMPOSITION_DISABLED;
3581 coding->cmp_data = NULL;
3582
3583 if (NILP (coding_system))
3584 goto label_invalid_coding_system;
3585
3586 coding_spec = Fget (coding_system, Qcoding_system);
3587
3588 if (!VECTORP (coding_spec)
3589 || XVECTOR (coding_spec)->size != 5
3590 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3591 goto label_invalid_coding_system;
3592
3593 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3594 if (VECTORP (eol_type))
3595 {
3596 coding->eol_type = CODING_EOL_UNDECIDED;
3597 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3598 if (system_eol_type != CODING_EOL_LF)
3599 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3600 }
3601 else if (XFASTINT (eol_type) == 1)
3602 {
3603 coding->eol_type = CODING_EOL_CRLF;
3604 coding->common_flags
3605 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3606 }
3607 else if (XFASTINT (eol_type) == 2)
3608 {
3609 coding->eol_type = CODING_EOL_CR;
3610 coding->common_flags
3611 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3612 }
3613 else
3614 {
3615 coding->common_flags = 0;
3616 coding->eol_type = CODING_EOL_LF;
3617 }
3618
3619 coding_type = XVECTOR (coding_spec)->contents[0];
3620 /* Try short cut. */
3621 if (SYMBOLP (coding_type))
3622 {
3623 if (EQ (coding_type, Qt))
3624 {
3625 coding->type = coding_type_undecided;
3626 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3627 }
3628 else
3629 coding->type = coding_type_no_conversion;
3630 /* Initialize this member. Any thing other than
3631 CODING_CATEGORY_IDX_UTF_16_BE and
3632 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3633 special treatment in detect_eol. */
3634 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3635
3636 return 0;
3637 }
3638
3639 /* Get values of coding system properties:
3640 `post-read-conversion', `pre-write-conversion',
3641 `translation-table-for-decode', `translation-table-for-encode'. */
3642 plist = XVECTOR (coding_spec)->contents[3];
3643 /* Pre & post conversion functions should be disabled if
3644 inhibit_eol_conversion is nonzero. This is the case that a code
3645 conversion function is called while those functions are running. */
3646 if (! inhibit_pre_post_conversion)
3647 {
3648 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3649 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3650 }
3651 val = Fplist_get (plist, Qtranslation_table_for_decode);
3652 if (SYMBOLP (val))
3653 val = Fget (val, Qtranslation_table_for_decode);
3654 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3655 val = Fplist_get (plist, Qtranslation_table_for_encode);
3656 if (SYMBOLP (val))
3657 val = Fget (val, Qtranslation_table_for_encode);
3658 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3659 val = Fplist_get (plist, Qcoding_category);
3660 if (!NILP (val))
3661 {
3662 val = Fget (val, Qcoding_category_index);
3663 if (INTEGERP (val))
3664 coding->category_idx = XINT (val);
3665 else
3666 goto label_invalid_coding_system;
3667 }
3668 else
3669 goto label_invalid_coding_system;
3670
3671 /* If the coding system has non-nil `composition' property, enable
3672 composition handling. */
3673 val = Fplist_get (plist, Qcomposition);
3674 if (!NILP (val))
3675 coding->composing = COMPOSITION_NO;
3676
3677 /* If the coding system is ascii-incompatible, record it in
3678 common_flags. */
3679 val = Fplist_get (plist, Qascii_incompatible);
3680 if (! NILP (val))
3681 coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3682
3683 switch (XFASTINT (coding_type))
3684 {
3685 case 0:
3686 coding->type = coding_type_emacs_mule;
3687 coding->common_flags
3688 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3689 if (!NILP (coding->post_read_conversion))
3690 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3691 if (!NILP (coding->pre_write_conversion))
3692 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3693 break;
3694
3695 case 1:
3696 coding->type = coding_type_sjis;
3697 coding->common_flags
3698 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3699 break;
3700
3701 case 2:
3702 coding->type = coding_type_iso2022;
3703 coding->common_flags
3704 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3705 {
3706 Lisp_Object val, temp;
3707 Lisp_Object *flags;
3708 int i, charset, reg_bits = 0;
3709
3710 val = XVECTOR (coding_spec)->contents[4];
3711
3712 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3713 goto label_invalid_coding_system;
3714
3715 flags = XVECTOR (val)->contents;
3716 coding->flags
3717 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3718 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3719 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3720 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3721 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3722 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3723 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3724 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3725 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3726 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3727 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3728 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3729 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3730 );
3731
3732 /* Invoke graphic register 0 to plane 0. */
3733 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3734 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3735 CODING_SPEC_ISO_INVOCATION (coding, 1)
3736 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3737 /* Not single shifting at first. */
3738 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3739 /* Beginning of buffer should also be regarded as bol. */
3740 CODING_SPEC_ISO_BOL (coding) = 1;
3741
3742 for (charset = 0; charset <= MAX_CHARSET; charset++)
3743 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3744 val = Vcharset_revision_alist;
3745 while (CONSP (val))
3746 {
3747 charset = get_charset_id (Fcar_safe (XCAR (val)));
3748 if (charset >= 0
3749 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3750 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3751 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3752 val = XCDR (val);
3753 }
3754
3755 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3756 FLAGS[REG] can be one of below:
3757 integer CHARSET: CHARSET occupies register I,
3758 t: designate nothing to REG initially, but can be used
3759 by any charsets,
3760 list of integer, nil, or t: designate the first
3761 element (if integer) to REG initially, the remaining
3762 elements (if integer) is designated to REG on request,
3763 if an element is t, REG can be used by any charsets,
3764 nil: REG is never used. */
3765 for (charset = 0; charset <= MAX_CHARSET; charset++)
3766 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3767 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3768 for (i = 0; i < 4; i++)
3769 {
3770 if ((INTEGERP (flags[i])
3771 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3772 || (charset = get_charset_id (flags[i])) >= 0)
3773 {
3774 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3775 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3776 }
3777 else if (EQ (flags[i], Qt))
3778 {
3779 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3780 reg_bits |= 1 << i;
3781 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3782 }
3783 else if (CONSP (flags[i]))
3784 {
3785 Lisp_Object tail;
3786 tail = flags[i];
3787
3788 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3789 if ((INTEGERP (XCAR (tail))
3790 && (charset = XINT (XCAR (tail)),
3791 CHARSET_VALID_P (charset)))
3792 || (charset = get_charset_id (XCAR (tail))) >= 0)
3793 {
3794 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3795 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3796 }
3797 else
3798 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3799 tail = XCDR (tail);
3800 while (CONSP (tail))
3801 {
3802 if ((INTEGERP (XCAR (tail))
3803 && (charset = XINT (XCAR (tail)),
3804 CHARSET_VALID_P (charset)))
3805 || (charset = get_charset_id (XCAR (tail))) >= 0)
3806 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3807 = i;
3808 else if (EQ (XCAR (tail), Qt))
3809 reg_bits |= 1 << i;
3810 tail = XCDR (tail);
3811 }
3812 }
3813 else
3814 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3815
3816 CODING_SPEC_ISO_DESIGNATION (coding, i)
3817 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3818 }
3819
3820 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3821 {
3822 /* REG 1 can be used only by locking shift in 7-bit env. */
3823 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3824 reg_bits &= ~2;
3825 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3826 /* Without any shifting, only REG 0 and 1 can be used. */
3827 reg_bits &= 3;
3828 }
3829
3830 if (reg_bits)
3831 for (charset = 0; charset <= MAX_CHARSET; charset++)
3832 {
3833 if (CHARSET_DEFINED_P (charset)
3834 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3835 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3836 {
3837 /* There exist some default graphic registers to be
3838 used by CHARSET. */
3839
3840 /* We had better avoid designating a charset of
3841 CHARS96 to REG 0 as far as possible. */
3842 if (CHARSET_CHARS (charset) == 96)
3843 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3844 = (reg_bits & 2
3845 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3846 else
3847 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3848 = (reg_bits & 1
3849 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3850 }
3851 }
3852 }
3853 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3854 coding->spec.iso2022.last_invalid_designation_register = -1;
3855 break;
3856
3857 case 3:
3858 coding->type = coding_type_big5;
3859 coding->common_flags
3860 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3861 coding->flags
3862 = (NILP (XVECTOR (coding_spec)->contents[4])
3863 ? CODING_FLAG_BIG5_HKU
3864 : CODING_FLAG_BIG5_ETEN);
3865 break;
3866
3867 case 4:
3868 coding->type = coding_type_ccl;
3869 coding->common_flags
3870 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3871 {
3872 val = XVECTOR (coding_spec)->contents[4];
3873 if (! CONSP (val)
3874 || setup_ccl_program (&(coding->spec.ccl.decoder),
3875 XCAR (val)) < 0
3876 || setup_ccl_program (&(coding->spec.ccl.encoder),
3877 XCDR (val)) < 0)
3878 goto label_invalid_coding_system;
3879
3880 bzero (coding->spec.ccl.valid_codes, 256);
3881 val = Fplist_get (plist, Qvalid_codes);
3882 if (CONSP (val))
3883 {
3884 Lisp_Object this;
3885
3886 for (; CONSP (val); val = XCDR (val))
3887 {
3888 this = XCAR (val);
3889 if (INTEGERP (this)
3890 && XINT (this) >= 0 && XINT (this) < 256)
3891 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3892 else if (CONSP (this)
3893 && INTEGERP (XCAR (this))
3894 && INTEGERP (XCDR (this)))
3895 {
3896 int start = XINT (XCAR (this));
3897 int end = XINT (XCDR (this));
3898
3899 if (start >= 0 && start <= end && end < 256)
3900 while (start <= end)
3901 coding->spec.ccl.valid_codes[start++] = 1;
3902 }
3903 }
3904 }
3905 }
3906 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3907 coding->spec.ccl.cr_carryover = 0;
3908 coding->spec.ccl.eight_bit_carryover[0] = 0;
3909 break;
3910
3911 case 5:
3912 coding->type = coding_type_raw_text;
3913 break;
3914
3915 default:
3916 goto label_invalid_coding_system;
3917 }
3918 return 0;
3919
3920 label_invalid_coding_system:
3921 coding->type = coding_type_no_conversion;
3922 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3923 coding->common_flags = 0;
3924 coding->eol_type = CODING_EOL_UNDECIDED;
3925 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3926 return NILP (coding_system) ? 0 : -1;
3927 }
3928
3929 /* Free memory blocks allocated for storing composition information. */
3930
3931 void
3932 coding_free_composition_data (coding)
3933 struct coding_system *coding;
3934 {
3935 struct composition_data *cmp_data = coding->cmp_data, *next;
3936
3937 if (!cmp_data)
3938 return;
3939 /* Memory blocks are chained. At first, rewind to the first, then,
3940 free blocks one by one. */
3941 while (cmp_data->prev)
3942 cmp_data = cmp_data->prev;
3943 while (cmp_data)
3944 {
3945 next = cmp_data->next;
3946 xfree (cmp_data);
3947 cmp_data = next;
3948 }
3949 coding->cmp_data = NULL;
3950 }
3951
3952 /* Set `char_offset' member of all memory blocks pointed by
3953 coding->cmp_data to POS. */
3954
3955 void
3956 coding_adjust_composition_offset (coding, pos)
3957 struct coding_system *coding;
3958 int pos;
3959 {
3960 struct composition_data *cmp_data;
3961
3962 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3963 cmp_data->char_offset = pos;
3964 }
3965
3966 /* Setup raw-text or one of its subsidiaries in the structure
3967 coding_system CODING according to the already setup value eol_type
3968 in CODING. CODING should be setup for some coding system in
3969 advance. */
3970
3971 void
3972 setup_raw_text_coding_system (coding)
3973 struct coding_system *coding;
3974 {
3975 if (coding->type != coding_type_raw_text)
3976 {
3977 coding->symbol = Qraw_text;
3978 coding->type = coding_type_raw_text;
3979 if (coding->eol_type != CODING_EOL_UNDECIDED)
3980 {
3981 Lisp_Object subsidiaries;
3982 subsidiaries = Fget (Qraw_text, Qeol_type);
3983
3984 if (VECTORP (subsidiaries)
3985 && XVECTOR (subsidiaries)->size == 3)
3986 coding->symbol
3987 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3988 }
3989 setup_coding_system (coding->symbol, coding);
3990 }
3991 return;
3992 }
3993
3994 /* Emacs has a mechanism to automatically detect a coding system if it
3995 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3996 it's impossible to distinguish some coding systems accurately
3997 because they use the same range of codes. So, at first, coding
3998 systems are grouped into the categories listed below:
3999
4000 o coding-category-emacs-mule
4001
4002 The category for a coding system which has the same code range
4003 as Emacs' internal format. Assigned the coding-system (Lisp
4004 symbol) `emacs-mule' by default.
4005
4006 o coding-category-sjis
4007
4008 The category for a coding system which has the same code range
4009 as SJIS. Assigned the coding-system (Lisp
4010 symbol) `japanese-shift-jis' by default.
4011
4012 o coding-category-iso-7
4013
4014 The category for a coding system which has the same code range
4015 as ISO2022 of 7-bit environment. This doesn't use any locking
4016 shift and single shift functions. This can encode/decode all
4017 charsets. Assigned the coding-system (Lisp symbol)
4018 `iso-2022-7bit' by default.
4019
4020 o coding-category-iso-7-tight
4021
4022 Same as coding-category-iso-7 except that this can
4023 encode/decode only the specified charsets.
4024
4025 o coding-category-iso-8-1
4026
4027 The category for a coding system which has the same code range
4028 as ISO2022 of 8-bit environment and graphic plane 1 used only
4029 for DIMENSION1 charset. This doesn't use any locking shift
4030 and single shift functions. Assigned the coding-system (Lisp
4031 symbol) `iso-latin-1' by default.
4032
4033 o coding-category-iso-8-2
4034
4035 The category for a coding system which has the same code range
4036 as ISO2022 of 8-bit environment and graphic plane 1 used only
4037 for DIMENSION2 charset. This doesn't use any locking shift
4038 and single shift functions. Assigned the coding-system (Lisp
4039 symbol) `japanese-iso-8bit' by default.
4040
4041 o coding-category-iso-7-else
4042
4043 The category for a coding system which has the same code range
4044 as ISO2022 of 7-bit environment but uses locking shift or
4045 single shift functions. Assigned the coding-system (Lisp
4046 symbol) `iso-2022-7bit-lock' by default.
4047
4048 o coding-category-iso-8-else
4049
4050 The category for a coding system which has the same code range
4051 as ISO2022 of 8-bit environment but uses locking shift or
4052 single shift functions. Assigned the coding-system (Lisp
4053 symbol) `iso-2022-8bit-ss2' by default.
4054
4055 o coding-category-big5
4056
4057 The category for a coding system which has the same code range
4058 as BIG5. Assigned the coding-system (Lisp symbol)
4059 `cn-big5' by default.
4060
4061 o coding-category-utf-8
4062
4063 The category for a coding system which has the same code range
4064 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4065 symbol) `utf-8' by default.
4066
4067 o coding-category-utf-16-be
4068
4069 The category for a coding system in which a text has an
4070 Unicode signature (cf. Unicode Standard) in the order of BIG
4071 endian at the head. Assigned the coding-system (Lisp symbol)
4072 `utf-16-be' by default.
4073
4074 o coding-category-utf-16-le
4075
4076 The category for a coding system in which a text has an
4077 Unicode signature (cf. Unicode Standard) in the order of
4078 LITTLE endian at the head. Assigned the coding-system (Lisp
4079 symbol) `utf-16-le' by default.
4080
4081 o coding-category-ccl
4082
4083 The category for a coding system of which encoder/decoder is
4084 written in CCL programs. The default value is nil, i.e., no
4085 coding system is assigned.
4086
4087 o coding-category-binary
4088
4089 The category for a coding system not categorized in any of the
4090 above. Assigned the coding-system (Lisp symbol)
4091 `no-conversion' by default.
4092
4093 Each of them is a Lisp symbol and the value is an actual
4094 `coding-system' (this is also a Lisp symbol) assigned by a user.
4095 What Emacs does actually is to detect a category of coding system.
4096 Then, it uses a `coding-system' assigned to it. If Emacs can't
4097 decide a single possible category, it selects a category of the
4098 highest priority. Priorities of categories are also specified by a
4099 user in a Lisp variable `coding-category-list'.
4100
4101 */
4102
4103 static
4104 int ascii_skip_code[256];
4105
4106 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4107 If it detects possible coding systems, return an integer in which
4108 appropriate flag bits are set. Flag bits are defined by macros
4109 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4110 it should point the table `coding_priorities'. In that case, only
4111 the flag bit for a coding system of the highest priority is set in
4112 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4113 range 0x80..0x9F are in multibyte form.
4114
4115 How many ASCII characters are at the head is returned as *SKIP. */
4116
4117 static int
4118 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4119 unsigned char *source;
4120 int src_bytes, *priorities, *skip;
4121 int multibytep;
4122 {
4123 register unsigned char c;
4124 unsigned char *src = source, *src_end = source + src_bytes;
4125 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4126 int i;
4127
4128 /* At first, skip all ASCII characters and control characters except
4129 for three ISO2022 specific control characters. */
4130 ascii_skip_code[ISO_CODE_SO] = 0;
4131 ascii_skip_code[ISO_CODE_SI] = 0;
4132 ascii_skip_code[ISO_CODE_ESC] = 0;
4133
4134 label_loop_detect_coding:
4135 while (src < src_end && ascii_skip_code[*src]) src++;
4136 *skip = src - source;
4137
4138 if (src >= src_end)
4139 /* We found nothing other than ASCII. There's nothing to do. */
4140 return 0;
4141
4142 c = *src;
4143 /* The text seems to be encoded in some multilingual coding system.
4144 Now, try to find in which coding system the text is encoded. */
4145 if (c < 0x80)
4146 {
4147 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4148 /* C is an ISO2022 specific control code of C0. */
4149 mask = detect_coding_iso2022 (src, src_end, multibytep);
4150 if (mask == 0)
4151 {
4152 /* No valid ISO2022 code follows C. Try again. */
4153 src++;
4154 if (c == ISO_CODE_ESC)
4155 ascii_skip_code[ISO_CODE_ESC] = 1;
4156 else
4157 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4158 goto label_loop_detect_coding;
4159 }
4160 if (priorities)
4161 {
4162 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4163 {
4164 if (mask & priorities[i])
4165 return priorities[i];
4166 }
4167 return CODING_CATEGORY_MASK_RAW_TEXT;
4168 }
4169 }
4170 else
4171 {
4172 int try;
4173
4174 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4175 c = src[1] - 0x20;
4176
4177 if (c < 0xA0)
4178 {
4179 /* C is the first byte of SJIS character code,
4180 or a leading-code of Emacs' internal format (emacs-mule),
4181 or the first byte of UTF-16. */
4182 try = (CODING_CATEGORY_MASK_SJIS
4183 | CODING_CATEGORY_MASK_EMACS_MULE
4184 | CODING_CATEGORY_MASK_UTF_16_BE
4185 | CODING_CATEGORY_MASK_UTF_16_LE);
4186
4187 /* Or, if C is a special latin extra code,
4188 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4189 or is an ISO2022 control-sequence-introducer (CSI),
4190 we should also consider the possibility of ISO2022 codings. */
4191 if ((VECTORP (Vlatin_extra_code_table)
4192 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4193 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4194 || (c == ISO_CODE_CSI
4195 && (src < src_end
4196 && (*src == ']'
4197 || ((*src == '0' || *src == '1' || *src == '2')
4198 && src + 1 < src_end
4199 && src[1] == ']')))))
4200 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4201 | CODING_CATEGORY_MASK_ISO_8BIT);
4202 }
4203 else
4204 /* C is a character of ISO2022 in graphic plane right,
4205 or a SJIS's 1-byte character code (i.e. JISX0201),
4206 or the first byte of BIG5's 2-byte code,
4207 or the first byte of UTF-8/16. */
4208 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4209 | CODING_CATEGORY_MASK_ISO_8BIT
4210 | CODING_CATEGORY_MASK_SJIS
4211 | CODING_CATEGORY_MASK_BIG5
4212 | CODING_CATEGORY_MASK_UTF_8
4213 | CODING_CATEGORY_MASK_UTF_16_BE
4214 | CODING_CATEGORY_MASK_UTF_16_LE);
4215
4216 /* Or, we may have to consider the possibility of CCL. */
4217 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4218 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4219 ->spec.ccl.valid_codes)[c])
4220 try |= CODING_CATEGORY_MASK_CCL;
4221
4222 mask = 0;
4223 utf16_examined_p = iso2022_examined_p = 0;
4224 if (priorities)
4225 {
4226 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4227 {
4228 if (!iso2022_examined_p
4229 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4230 {
4231 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4232 iso2022_examined_p = 1;
4233 }
4234 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4235 mask |= detect_coding_sjis (src, src_end, multibytep);
4236 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4237 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4238 else if (!utf16_examined_p
4239 && (priorities[i] & try &
4240 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4241 {
4242 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4243 utf16_examined_p = 1;
4244 }
4245 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4246 mask |= detect_coding_big5 (src, src_end, multibytep);
4247 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4248 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4249 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4250 mask |= detect_coding_ccl (src, src_end, multibytep);
4251 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4252 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4253 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4254 mask |= CODING_CATEGORY_MASK_BINARY;
4255 if (mask & priorities[i])
4256 return priorities[i];
4257 }
4258 return CODING_CATEGORY_MASK_RAW_TEXT;
4259 }
4260 if (try & CODING_CATEGORY_MASK_ISO)
4261 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4262 if (try & CODING_CATEGORY_MASK_SJIS)
4263 mask |= detect_coding_sjis (src, src_end, multibytep);
4264 if (try & CODING_CATEGORY_MASK_BIG5)
4265 mask |= detect_coding_big5 (src, src_end, multibytep);
4266 if (try & CODING_CATEGORY_MASK_UTF_8)
4267 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4268 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4269 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4270 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4271 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4272 if (try & CODING_CATEGORY_MASK_CCL)
4273 mask |= detect_coding_ccl (src, src_end, multibytep);
4274 }
4275 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4276 }
4277
4278 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4279 The information of the detected coding system is set in CODING. */
4280
4281 void
4282 detect_coding (coding, src, src_bytes)
4283 struct coding_system *coding;
4284 const unsigned char *src;
4285 int src_bytes;
4286 {
4287 unsigned int idx;
4288 int skip, mask;
4289 Lisp_Object val;
4290
4291 val = Vcoding_category_list;
4292 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4293 coding->src_multibyte);
4294 coding->heading_ascii = skip;
4295
4296 if (!mask) return;
4297
4298 /* We found a single coding system of the highest priority in MASK. */
4299 idx = 0;
4300 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4301 if (! mask)
4302 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4303
4304 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4305
4306 if (coding->eol_type != CODING_EOL_UNDECIDED)
4307 {
4308 Lisp_Object tmp;
4309
4310 tmp = Fget (val, Qeol_type);
4311 if (VECTORP (tmp))
4312 val = XVECTOR (tmp)->contents[coding->eol_type];
4313 }
4314
4315 /* Setup this new coding system while preserving some slots. */
4316 {
4317 int src_multibyte = coding->src_multibyte;
4318 int dst_multibyte = coding->dst_multibyte;
4319
4320 setup_coding_system (val, coding);
4321 coding->src_multibyte = src_multibyte;
4322 coding->dst_multibyte = dst_multibyte;
4323 coding->heading_ascii = skip;
4324 }
4325 }
4326
4327 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4328 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4329 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4330
4331 How many non-eol characters are at the head is returned as *SKIP. */
4332
4333 #define MAX_EOL_CHECK_COUNT 3
4334
4335 static int
4336 detect_eol_type (source, src_bytes, skip)
4337 unsigned char *source;
4338 int src_bytes, *skip;
4339 {
4340 unsigned char *src = source, *src_end = src + src_bytes;
4341 unsigned char c;
4342 int total = 0; /* How many end-of-lines are found so far. */
4343 int eol_type = CODING_EOL_UNDECIDED;
4344 int this_eol_type;
4345
4346 *skip = 0;
4347
4348 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4349 {
4350 c = *src++;
4351 if (c == '\n' || c == '\r')
4352 {
4353 if (*skip == 0)
4354 *skip = src - 1 - source;
4355 total++;
4356 if (c == '\n')
4357 this_eol_type = CODING_EOL_LF;
4358 else if (src >= src_end || *src != '\n')
4359 this_eol_type = CODING_EOL_CR;
4360 else
4361 this_eol_type = CODING_EOL_CRLF, src++;
4362
4363 if (eol_type == CODING_EOL_UNDECIDED)
4364 /* This is the first end-of-line. */
4365 eol_type = this_eol_type;
4366 else if (eol_type != this_eol_type)
4367 {
4368 /* The found type is different from what found before. */
4369 eol_type = CODING_EOL_INCONSISTENT;
4370 break;
4371 }
4372 }
4373 }
4374
4375 if (*skip == 0)
4376 *skip = src_end - source;
4377 return eol_type;
4378 }
4379
4380 /* Like detect_eol_type, but detect EOL type in 2-octet
4381 big-endian/little-endian format for coding systems utf-16-be and
4382 utf-16-le. */
4383
4384 static int
4385 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4386 unsigned char *source;
4387 int src_bytes, *skip, big_endian_p;
4388 {
4389 unsigned char *src = source, *src_end = src + src_bytes;
4390 unsigned int c1, c2;
4391 int total = 0; /* How many end-of-lines are found so far. */
4392 int eol_type = CODING_EOL_UNDECIDED;
4393 int this_eol_type;
4394 int msb, lsb;
4395
4396 if (big_endian_p)
4397 msb = 0, lsb = 1;
4398 else
4399 msb = 1, lsb = 0;
4400
4401 *skip = 0;
4402
4403 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4404 {
4405 c1 = (src[msb] << 8) | (src[lsb]);
4406 src += 2;
4407
4408 if (c1 == '\n' || c1 == '\r')
4409 {
4410 if (*skip == 0)
4411 *skip = src - 2 - source;
4412 total++;
4413 if (c1 == '\n')
4414 {
4415 this_eol_type = CODING_EOL_LF;
4416 }
4417 else
4418 {
4419 if ((src + 1) >= src_end)
4420 {
4421 this_eol_type = CODING_EOL_CR;
4422 }
4423 else
4424 {
4425 c2 = (src[msb] << 8) | (src[lsb]);
4426 if (c2 == '\n')
4427 this_eol_type = CODING_EOL_CRLF, src += 2;
4428 else
4429 this_eol_type = CODING_EOL_CR;
4430 }
4431 }
4432
4433 if (eol_type == CODING_EOL_UNDECIDED)
4434 /* This is the first end-of-line. */
4435 eol_type = this_eol_type;
4436 else if (eol_type != this_eol_type)
4437 {
4438 /* The found type is different from what found before. */
4439 eol_type = CODING_EOL_INCONSISTENT;
4440 break;
4441 }
4442 }
4443 }
4444
4445 if (*skip == 0)
4446 *skip = src_end - source;
4447 return eol_type;
4448 }
4449
4450 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4451 is encoded. If it detects an appropriate format of end-of-line, it
4452 sets the information in *CODING. */
4453
4454 void
4455 detect_eol (coding, src, src_bytes)
4456 struct coding_system *coding;
4457 const unsigned char *src;
4458 int src_bytes;
4459 {
4460 Lisp_Object val;
4461 int skip;
4462 int eol_type;
4463
4464 switch (coding->category_idx)
4465 {
4466 case CODING_CATEGORY_IDX_UTF_16_BE:
4467 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4468 break;
4469 case CODING_CATEGORY_IDX_UTF_16_LE:
4470 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4471 break;
4472 default:
4473 eol_type = detect_eol_type (src, src_bytes, &skip);
4474 break;
4475 }
4476
4477 if (coding->heading_ascii > skip)
4478 coding->heading_ascii = skip;
4479 else
4480 skip = coding->heading_ascii;
4481
4482 if (eol_type == CODING_EOL_UNDECIDED)
4483 return;
4484 if (eol_type == CODING_EOL_INCONSISTENT)
4485 {
4486 #if 0
4487 /* This code is suppressed until we find a better way to
4488 distinguish raw text file and binary file. */
4489
4490 /* If we have already detected that the coding is raw-text, the
4491 coding should actually be no-conversion. */
4492 if (coding->type == coding_type_raw_text)
4493 {
4494 setup_coding_system (Qno_conversion, coding);
4495 return;
4496 }
4497 /* Else, let's decode only text code anyway. */
4498 #endif /* 0 */
4499 eol_type = CODING_EOL_LF;
4500 }
4501
4502 val = Fget (coding->symbol, Qeol_type);
4503 if (VECTORP (val) && XVECTOR (val)->size == 3)
4504 {
4505 int src_multibyte = coding->src_multibyte;
4506 int dst_multibyte = coding->dst_multibyte;
4507 struct composition_data *cmp_data = coding->cmp_data;
4508
4509 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4510 coding->src_multibyte = src_multibyte;
4511 coding->dst_multibyte = dst_multibyte;
4512 coding->heading_ascii = skip;
4513 coding->cmp_data = cmp_data;
4514 }
4515 }
4516
/* Number of bytes added on top of every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding with CODING may grow a text: 3 for
   ISO2022, the decoder's own `buf_magnification' for CCL, and 2 for
   everything else.  CODING is a pointer expression; it is evaluated
   more than once, so it must not have side effects.  */
#define DECODING_BUFFER_MAG(coding)				\
  ((coding)->type == coding_type_iso2022			\
   ? 3								\
   : ((coding)->type == coding_type_ccl				\
      ? (coding)->spec.ccl.decoder.buf_magnification		\
      : 2))
4525
4526 /* Return maximum size (bytes) of a buffer enough for decoding
4527 SRC_BYTES of text encoded in CODING. */
4528
4529 int
4530 decoding_buffer_size (coding, src_bytes)
4531 struct coding_system *coding;
4532 int src_bytes;
4533 {
4534 return (src_bytes * DECODING_BUFFER_MAG (coding)
4535 + CONVERSION_BUFFER_EXTRA_ROOM);
4536 }
4537
4538 /* Return maximum size (bytes) of a buffer enough for encoding
4539 SRC_BYTES of text to CODING. */
4540
4541 int
4542 encoding_buffer_size (coding, src_bytes)
4543 struct coding_system *coding;
4544 int src_bytes;
4545 {
4546 int magnification;
4547
4548 if (coding->type == coding_type_ccl)
4549 {
4550 magnification = coding->spec.ccl.encoder.buf_magnification;
4551 if (coding->eol_type == CODING_EOL_CRLF)
4552 magnification *= 2;
4553 }
4554 else if (CODING_REQUIRE_ENCODING (coding))
4555 magnification = 3;
4556 else
4557 magnification = 1;
4558
4559 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4560 }
4561
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;			/* Number of bytes available at DATA.  */
  int on_stack;			/* 1 if DATA was allocated by alloca,
				   0 if it was allocated by xmalloc.  */
  unsigned char *data;		/* The storage itself.  */
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests below MAX_ALLOCA are served by alloca, the others by
   xmalloc; BUF.on_stack records which one was used.

   This has to stay a macro: memory obtained from alloca belongs to
   the calling function's frame.  BUF and LEN are evaluated more than
   once, so they must not have side effects.  */
#define allocate_conversion_buffer(buf, len)			\
  do {								\
    if ((len) < MAX_ALLOCA)					\
      {								\
	(buf).data = (unsigned char *) alloca ((len));		\
	(buf).on_stack = 1;					\
      }								\
    else							\
      {								\
	(buf).data = (unsigned char *) xmalloc ((len));		\
	(buf).on_stack = 0;					\
      }								\
    (buf).size = (len);						\
  } while (0)
4585
4586 /* Double the allocated memory for *BUF. */
4587 static void
4588 extend_conversion_buffer (buf)
4589 struct conversion_buffer *buf;
4590 {
4591 if (buf->on_stack)
4592 {
4593 unsigned char *save = buf->data;
4594 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4595 bcopy (save, buf->data, buf->size);
4596 buf->on_stack = 0;
4597 }
4598 else
4599 {
4600 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4601 }
4602 buf->size *= 2;
4603 }
4604
4605 /* Free the allocated memory for BUF if it is not on stack. */
4606 static void
4607 free_conversion_buffer (buf)
4608 struct conversion_buffer *buf;
4609 {
4610 if (!buf->on_stack)
4611 xfree (buf->data);
4612 }
4613
4614 int
4615 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4616 struct coding_system *coding;
4617 unsigned char *source, *destination;
4618 int src_bytes, dst_bytes, encodep;
4619 {
4620 struct ccl_program *ccl
4621 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4622 unsigned char *dst = destination;
4623
4624 ccl->suppress_error = coding->suppress_error;
4625 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4626 if (encodep)
4627 {
4628 /* On encoding, EOL format is converted within ccl_driver. For
4629 that, setup proper information in the structure CCL. */
4630 ccl->eol_type = coding->eol_type;
4631 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4632 ccl->eol_type = CODING_EOL_LF;
4633 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4634 ccl->eight_bit_control = coding->dst_multibyte;
4635 }
4636 else
4637 ccl->eight_bit_control = 1;
4638 ccl->multibyte = coding->src_multibyte;
4639 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4640 {
4641 /* Move carryover bytes to DESTINATION. */
4642 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4643 while (*p)
4644 *dst++ = *p++;
4645 coding->spec.ccl.eight_bit_carryover[0] = 0;
4646 if (dst_bytes)
4647 dst_bytes -= dst - destination;
4648 }
4649
4650 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4651 &(coding->consumed))
4652 + dst - destination);
4653
4654 if (encodep)
4655 {
4656 coding->produced_char = coding->produced;
4657 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4658 }
4659 else if (!ccl->eight_bit_control)
4660 {
4661 /* The produced bytes forms a valid multibyte sequence. */
4662 coding->produced_char
4663 = multibyte_chars_in_text (destination, coding->produced);
4664 coding->spec.ccl.eight_bit_carryover[0] = 0;
4665 }
4666 else
4667 {
4668 /* On decoding, the destination should always multibyte. But,
4669 CCL program might have been generated an invalid multibyte
4670 sequence. Here we make such a sequence valid as
4671 multibyte. */
4672 int bytes
4673 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4674
4675 if ((coding->consumed < src_bytes
4676 || !ccl->last_block)
4677 && coding->produced >= 1
4678 && destination[coding->produced - 1] >= 0x80)
4679 {
4680 /* We should not convert the tailing 8-bit codes to
4681 multibyte form even if they doesn't form a valid
4682 multibyte sequence. They may form a valid sequence in
4683 the next call. */
4684 int carryover = 0;
4685
4686 if (destination[coding->produced - 1] < 0xA0)
4687 carryover = 1;
4688 else if (coding->produced >= 2)
4689 {
4690 if (destination[coding->produced - 2] >= 0x80)
4691 {
4692 if (destination[coding->produced - 2] < 0xA0)
4693 carryover = 2;
4694 else if (coding->produced >= 3
4695 && destination[coding->produced - 3] >= 0x80
4696 && destination[coding->produced - 3] < 0xA0)
4697 carryover = 3;
4698 }
4699 }
4700 if (carryover > 0)
4701 {
4702 BCOPY_SHORT (destination + coding->produced - carryover,
4703 coding->spec.ccl.eight_bit_carryover,
4704 carryover);
4705 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4706 coding->produced -= carryover;
4707 }
4708 }
4709 coding->produced = str_as_multibyte (destination, bytes,
4710 coding->produced,
4711 &(coding->produced_char));
4712 }
4713
4714 switch (ccl->status)
4715 {
4716 case CCL_STAT_SUSPEND_BY_SRC:
4717 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4718 break;
4719 case CCL_STAT_SUSPEND_BY_DST:
4720 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4721 break;
4722 case CCL_STAT_QUIT:
4723 case CCL_STAT_INVALID_CMD:
4724 coding->result = CODING_FINISH_INTERRUPT;
4725 break;
4726 default:
4727 coding->result = CODING_FINISH_NORMAL;
4728 break;
4729 }
4730 return coding->result;
4731 }
4732
4733 /* Decode EOL format of the text at PTR of BYTES length destructively
4734 according to CODING->eol_type. This is called after the CCL
4735 program produced a decoded text at PTR. If we do CRLF->LF
4736 conversion, update CODING->produced and CODING->produced_char. */
4737
4738 static void
4739 decode_eol_post_ccl (coding, ptr, bytes)
4740 struct coding_system *coding;
4741 unsigned char *ptr;
4742 int bytes;
4743 {
4744 Lisp_Object val, saved_coding_symbol;
4745 unsigned char *pend = ptr + bytes;
4746 int dummy;
4747
4748 /* Remember the current coding system symbol. We set it back when
4749 an inconsistent EOL is found so that `last-coding-system-used' is
4750 set to the coding system that doesn't specify EOL conversion. */
4751 saved_coding_symbol = coding->symbol;
4752
4753 coding->spec.ccl.cr_carryover = 0;
4754 if (coding->eol_type == CODING_EOL_UNDECIDED)
4755 {
4756 /* Here, to avoid the call of setup_coding_system, we directly
4757 call detect_eol_type. */
4758 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4759 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4760 coding->eol_type = CODING_EOL_LF;
4761 if (coding->eol_type != CODING_EOL_UNDECIDED)
4762 {
4763 val = Fget (coding->symbol, Qeol_type);
4764 if (VECTORP (val) && XVECTOR (val)->size == 3)
4765 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4766 }
4767 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4768 }
4769
4770 if (coding->eol_type == CODING_EOL_LF
4771 || coding->eol_type == CODING_EOL_UNDECIDED)
4772 {
4773 /* We have nothing to do. */
4774 ptr = pend;
4775 }
4776 else if (coding->eol_type == CODING_EOL_CRLF)
4777 {
4778 unsigned char *pstart = ptr, *p = ptr;
4779
4780 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4781 && *(pend - 1) == '\r')
4782 {
4783 /* If the last character is CR, we can't handle it here
4784 because LF will be in the not-yet-decoded source text.
4785 Record that the CR is not yet processed. */
4786 coding->spec.ccl.cr_carryover = 1;
4787 coding->produced--;
4788 coding->produced_char--;
4789 pend--;
4790 }
4791 while (ptr < pend)
4792 {
4793 if (*ptr == '\r')
4794 {
4795 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4796 {
4797 *p++ = '\n';
4798 ptr += 2;
4799 }
4800 else
4801 {
4802 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4803 goto undo_eol_conversion;
4804 *p++ = *ptr++;
4805 }
4806 }
4807 else if (*ptr == '\n'
4808 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4809 goto undo_eol_conversion;
4810 else
4811 *p++ = *ptr++;
4812 continue;
4813
4814 undo_eol_conversion:
4815 /* We have faced with inconsistent EOL format at PTR.
4816 Convert all LFs before PTR back to CRLFs. */
4817 for (p--, ptr--; p >= pstart; p--)
4818 {
4819 if (*p == '\n')
4820 *ptr-- = '\n', *ptr-- = '\r';
4821 else
4822 *ptr-- = *p;
4823 }
4824 /* If carryover is recorded, cancel it because we don't
4825 convert CRLF anymore. */
4826 if (coding->spec.ccl.cr_carryover)
4827 {
4828 coding->spec.ccl.cr_carryover = 0;
4829 coding->produced++;
4830 coding->produced_char++;
4831 pend++;
4832 }
4833 p = ptr = pend;
4834 coding->eol_type = CODING_EOL_LF;
4835 coding->symbol = saved_coding_symbol;
4836 }
4837 if (p < pend)
4838 {
4839 /* As each two-byte sequence CRLF was converted to LF, (PEND
4840 - P) is the number of deleted characters. */
4841 coding->produced -= pend - p;
4842 coding->produced_char -= pend - p;
4843 }
4844 }
4845 else /* i.e. coding->eol_type == CODING_EOL_CR */
4846 {
4847 unsigned char *p = ptr;
4848
4849 for (; ptr < pend; ptr++)
4850 {
4851 if (*ptr == '\r')
4852 *ptr = '\n';
4853 else if (*ptr == '\n'
4854 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4855 {
4856 for (; p < ptr; p++)
4857 {
4858 if (*p == '\n')
4859 *p = '\r';
4860 }
4861 ptr = pend;
4862 coding->eol_type = CODING_EOL_LF;
4863 coding->symbol = saved_coding_symbol;
4864 }
4865 }
4866 }
4867 }
4868
4869 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4870 decoding, it may detect coding system and format of end-of-line if
4871 those are not yet decided. The source should be unibyte, the
4872 result is multibyte if CODING->dst_multibyte is nonzero, else
4873 unibyte. */
4874
4875 int
4876 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4877 struct coding_system *coding;
4878 const unsigned char *source;
4879 unsigned char *destination;
4880 int src_bytes, dst_bytes;
4881 {
4882 int extra = 0;
4883
4884 if (coding->type == coding_type_undecided)
4885 detect_coding (coding, source, src_bytes);
4886
4887 if (coding->eol_type == CODING_EOL_UNDECIDED
4888 && coding->type != coding_type_ccl)
4889 {
4890 detect_eol (coding, source, src_bytes);
4891 /* We had better recover the original eol format if we
4892 encounter an inconsistent eol format while decoding. */
4893 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4894 }
4895
4896 coding->produced = coding->produced_char = 0;
4897 coding->consumed = coding->consumed_char = 0;
4898 coding->errors = 0;
4899 coding->result = CODING_FINISH_NORMAL;
4900
4901 switch (coding->type)
4902 {
4903 case coding_type_sjis:
4904 decode_coding_sjis_big5 (coding, source, destination,
4905 src_bytes, dst_bytes, 1);
4906 break;
4907
4908 case coding_type_iso2022:
4909 decode_coding_iso2022 (coding, source, destination,
4910 src_bytes, dst_bytes);
4911 break;
4912
4913 case coding_type_big5:
4914 decode_coding_sjis_big5 (coding, source, destination,
4915 src_bytes, dst_bytes, 0);
4916 break;
4917
4918 case coding_type_emacs_mule:
4919 decode_coding_emacs_mule (coding, source, destination,
4920 src_bytes, dst_bytes);
4921 break;
4922
4923 case coding_type_ccl:
4924 if (coding->spec.ccl.cr_carryover)
4925 {
4926 /* Put the CR which was not processed by the previous call
4927 of decode_eol_post_ccl in DESTINATION. It will be
4928 decoded together with the following LF by the call to
4929 decode_eol_post_ccl below. */
4930 *destination = '\r';
4931 coding->produced++;
4932 coding->produced_char++;
4933 dst_bytes--;
4934 extra = coding->spec.ccl.cr_carryover;
4935 }
4936 ccl_coding_driver (coding, source, destination + extra,
4937 src_bytes, dst_bytes, 0);
4938 if (coding->eol_type != CODING_EOL_LF)
4939 {
4940 coding->produced += extra;
4941 coding->produced_char += extra;
4942 decode_eol_post_ccl (coding, destination, coding->produced);
4943 }
4944 break;
4945
4946 default:
4947 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4948 }
4949
4950 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4951 && coding->mode & CODING_MODE_LAST_BLOCK
4952 && coding->consumed == src_bytes)
4953 coding->result = CODING_FINISH_NORMAL;
4954
4955 if (coding->mode & CODING_MODE_LAST_BLOCK
4956 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4957 {
4958 const unsigned char *src = source + coding->consumed;
4959 unsigned char *dst = destination + coding->produced;
4960
4961 src_bytes -= coding->consumed;
4962 coding->errors++;
4963 if (COMPOSING_P (coding))
4964 DECODE_COMPOSITION_END ('1');
4965 while (src_bytes--)
4966 {
4967 int c = *src++;
4968 dst += CHAR_STRING (c, dst);
4969 coding->produced_char++;
4970 }
4971 coding->consumed = coding->consumed_char = src - source;
4972 coding->produced = dst - destination;
4973 coding->result = CODING_FINISH_NORMAL;
4974 }
4975
4976 if (!coding->dst_multibyte)
4977 {
4978 coding->produced = str_as_unibyte (destination, coding->produced);
4979 coding->produced_char = coding->produced;
4980 }
4981
4982 return coding->result;
4983 }
4984
4985 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4986 multibyteness of the source is CODING->src_multibyte, the
4987 multibyteness of the result is always unibyte. */
4988
4989 int
4990 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4991 struct coding_system *coding;
4992 const unsigned char *source;
4993 unsigned char *destination;
4994 int src_bytes, dst_bytes;
4995 {
4996 coding->produced = coding->produced_char = 0;
4997 coding->consumed = coding->consumed_char = 0;
4998 coding->errors = 0;
4999 coding->result = CODING_FINISH_NORMAL;
5000 if (coding->eol_type == CODING_EOL_UNDECIDED)
5001 coding->eol_type = CODING_EOL_LF;
5002
5003 switch (coding->type)
5004 {
5005 case coding_type_sjis:
5006 encode_coding_sjis_big5 (coding, source, destination,
5007 src_bytes, dst_bytes, 1);
5008 break;
5009
5010 case coding_type_iso2022:
5011 encode_coding_iso2022 (coding, source, destination,
5012 src_bytes, dst_bytes);
5013 break;
5014
5015 case coding_type_big5:
5016 encode_coding_sjis_big5 (coding, source, destination,
5017 src_bytes, dst_bytes, 0);
5018 break;
5019
5020 case coding_type_emacs_mule:
5021 encode_coding_emacs_mule (coding, source, destination,
5022 src_bytes, dst_bytes);
5023 break;
5024
5025 case coding_type_ccl:
5026 ccl_coding_driver (coding, source, destination,
5027 src_bytes, dst_bytes, 1);
5028 break;
5029
5030 default:
5031 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5032 }
5033
5034 if (coding->mode & CODING_MODE_LAST_BLOCK
5035 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5036 {
5037 const unsigned char *src = source + coding->consumed;
5038 unsigned char *dst = destination + coding->produced;
5039
5040 if (coding->type == coding_type_iso2022)
5041 ENCODE_RESET_PLANE_AND_REGISTER;
5042 if (COMPOSING_P (coding))
5043 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5044 if (coding->consumed < src_bytes)
5045 {
5046 int len = src_bytes - coding->consumed;
5047
5048 BCOPY_SHORT (src, dst, len);
5049 if (coding->src_multibyte)
5050 len = str_as_unibyte (dst, len);
5051 dst += len;
5052 coding->consumed = src_bytes;
5053 }
5054 coding->produced = coding->produced_char = dst - destination;
5055 coding->result = CODING_FINISH_NORMAL;
5056 }
5057
5058 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5059 && coding->consumed == src_bytes)
5060 coding->result = CODING_FINISH_NORMAL;
5061
5062 return coding->result;
5063 }
5064
5065 /* Scan text in the region between *BEG and *END (byte positions),
5066 skip characters which we don't have to decode by coding system
5067 CODING at the head and tail, then set *BEG and *END to the region
5068 of the text we actually have to convert. The caller should move
5069 the gap out of the region in advance if the region is from a
5070 buffer.
5071
5072 If STR is not NULL, *BEG and *END are indices into STR. */
5073
5074 static void
5075 shrink_decoding_region (beg, end, coding, str)
5076 int *beg, *end;
5077 struct coding_system *coding;
5078 unsigned char *str;
5079 {
5080 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5081 int eol_conversion;
5082 Lisp_Object translation_table;
5083
5084 if (coding->type == coding_type_ccl
5085 || coding->type == coding_type_undecided
5086 || coding->eol_type != CODING_EOL_LF
5087 || !NILP (coding->post_read_conversion)
5088 || coding->composing != COMPOSITION_DISABLED)
5089 {
5090 /* We can't skip any data. */
5091 return;
5092 }
5093 if (coding->type == coding_type_no_conversion
5094 || coding->type == coding_type_raw_text
5095 || coding->type == coding_type_emacs_mule)
5096 {
5097 /* We need no conversion, but don't have to skip any data here.
5098 Decoding routine handles them effectively anyway. */
5099 return;
5100 }
5101
5102 translation_table = coding->translation_table_for_decode;
5103 if (NILP (translation_table) && !NILP (Venable_character_translation))
5104 translation_table = Vstandard_translation_table_for_decode;
5105 if (CHAR_TABLE_P (translation_table))
5106 {
5107 int i;
5108 for (i = 0; i < 128; i++)
5109 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5110 break;
5111 if (i < 128)
5112 /* Some ASCII character should be translated. We give up
5113 shrinking. */
5114 return;
5115 }
5116
5117 if (coding->heading_ascii >= 0)
5118 /* Detection routine has already found how much we can skip at the
5119 head. */
5120 *beg += coding->heading_ascii;
5121
5122 if (str)
5123 {
5124 begp_orig = begp = str + *beg;
5125 endp_orig = endp = str + *end;
5126 }
5127 else
5128 {
5129 begp_orig = begp = BYTE_POS_ADDR (*beg);
5130 endp_orig = endp = begp + *end - *beg;
5131 }
5132
5133 eol_conversion = (coding->eol_type == CODING_EOL_CR
5134 || coding->eol_type == CODING_EOL_CRLF);
5135
5136 switch (coding->type)
5137 {
5138 case coding_type_sjis:
5139 case coding_type_big5:
5140 /* We can skip all ASCII characters at the head. */
5141 if (coding->heading_ascii < 0)
5142 {
5143 if (eol_conversion)
5144 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5145 else
5146 while (begp < endp && *begp < 0x80) begp++;
5147 }
5148 /* We can skip all ASCII characters at the tail except for the
5149 second byte of SJIS or BIG5 code. */
5150 if (eol_conversion)
5151 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5152 else
5153 while (begp < endp && endp[-1] < 0x80) endp--;
5154 /* Do not consider LF as ascii if preceded by CR, since that
5155 confuses eol decoding. */
5156 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5157 endp++;
5158 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5159 endp++;
5160 break;
5161
5162 case coding_type_iso2022:
5163 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5164 /* We can't skip any data. */
5165 break;
5166 if (coding->heading_ascii < 0)
5167 {
5168 /* We can skip all ASCII characters at the head except for a
5169 few control codes. */
5170 while (begp < endp && (c = *begp) < 0x80
5171 && c != ISO_CODE_CR && c != ISO_CODE_SO
5172 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5173 && (!eol_conversion || c != ISO_CODE_LF))
5174 begp++;
5175 }
5176 switch (coding->category_idx)
5177 {
5178 case CODING_CATEGORY_IDX_ISO_8_1:
5179 case CODING_CATEGORY_IDX_ISO_8_2:
5180 /* We can skip all ASCII characters at the tail. */
5181 if (eol_conversion)
5182 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5183 else
5184 while (begp < endp && endp[-1] < 0x80) endp--;
5185 /* Do not consider LF as ascii if preceded by CR, since that
5186 confuses eol decoding. */
5187 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5188 endp++;
5189 break;
5190
5191 case CODING_CATEGORY_IDX_ISO_7:
5192 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5193 {
5194 /* We can skip all characters at the tail except for 8-bit
5195 codes and ESC and the following 2-byte at the tail. */
5196 unsigned char *eight_bit = NULL;
5197
5198 if (eol_conversion)
5199 while (begp < endp
5200 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5201 {
5202 if (!eight_bit && c & 0x80) eight_bit = endp;
5203 endp--;
5204 }
5205 else
5206 while (begp < endp
5207 && (c = endp[-1]) != ISO_CODE_ESC)
5208 {
5209 if (!eight_bit && c & 0x80) eight_bit = endp;
5210 endp--;
5211 }
5212 /* Do not consider LF as ascii if preceded by CR, since that
5213 confuses eol decoding. */
5214 if (begp < endp && endp < endp_orig
5215 && endp[-1] == '\r' && endp[0] == '\n')
5216 endp++;
5217 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5218 {
5219 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5220 /* This is an ASCII designation sequence. We can
5221 surely skip the tail. But, if we have
5222 encountered an 8-bit code, skip only the codes
5223 after that. */
5224 endp = eight_bit ? eight_bit : endp + 2;
5225 else
5226 /* Hmmm, we can't skip the tail. */
5227 endp = endp_orig;
5228 }
5229 else if (eight_bit)
5230 endp = eight_bit;
5231 }
5232 }
5233 break;
5234
5235 default:
5236 abort ();
5237 }
5238 *beg += begp - begp_orig;
5239 *end += endp - endp_orig;
5240 return;
5241 }
5242
5243 /* Like shrink_decoding_region but for encoding. */
5244
5245 static void
5246 shrink_encoding_region (beg, end, coding, str)
5247 int *beg, *end;
5248 struct coding_system *coding;
5249 unsigned char *str;
5250 {
5251 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5252 int eol_conversion;
5253 Lisp_Object translation_table;
5254
5255 if (coding->type == coding_type_ccl
5256 || coding->eol_type == CODING_EOL_CRLF
5257 || coding->eol_type == CODING_EOL_CR
5258 || (coding->cmp_data && coding->cmp_data->used > 0))
5259 {
5260 /* We can't skip any data. */
5261 return;
5262 }
5263 if (coding->type == coding_type_no_conversion
5264 || coding->type == coding_type_raw_text
5265 || coding->type == coding_type_emacs_mule
5266 || coding->type == coding_type_undecided)
5267 {
5268 /* We need no conversion, but don't have to skip any data here.
5269 Encoding routine handles them effectively anyway. */
5270 return;
5271 }
5272
5273 translation_table = coding->translation_table_for_encode;
5274 if (NILP (translation_table) && !NILP (Venable_character_translation))
5275 translation_table = Vstandard_translation_table_for_encode;
5276 if (CHAR_TABLE_P (translation_table))
5277 {
5278 int i;
5279 for (i = 0; i < 128; i++)
5280 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5281 break;
5282 if (i < 128)
5283 /* Some ASCII character should be translated. We give up
5284 shrinking. */
5285 return;
5286 }
5287
5288 if (str)
5289 {
5290 begp_orig = begp = str + *beg;
5291 endp_orig = endp = str + *end;
5292 }
5293 else
5294 {
5295 begp_orig = begp = BYTE_POS_ADDR (*beg);
5296 endp_orig = endp = begp + *end - *beg;
5297 }
5298
5299 eol_conversion = (coding->eol_type == CODING_EOL_CR
5300 || coding->eol_type == CODING_EOL_CRLF);
5301
5302 /* Here, we don't have to check coding->pre_write_conversion because
5303 the caller is expected to have handled it already. */
5304 switch (coding->type)
5305 {
5306 case coding_type_iso2022:
5307 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5308 /* We can't skip any data. */
5309 break;
5310 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5311 {
5312 unsigned char *bol = begp;
5313 while (begp < endp && *begp < 0x80)
5314 {
5315 begp++;
5316 if (begp[-1] == '\n')
5317 bol = begp;
5318 }
5319 begp = bol;
5320 goto label_skip_tail;
5321 }
5322 /* fall down ... */
5323
5324 case coding_type_sjis:
5325 case coding_type_big5:
5326 /* We can skip all ASCII characters at the head and tail. */
5327 if (eol_conversion)
5328 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5329 else
5330 while (begp < endp && *begp < 0x80) begp++;
5331 label_skip_tail:
5332 if (eol_conversion)
5333 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5334 else
5335 while (begp < endp && *(endp - 1) < 0x80) endp--;
5336 break;
5337
5338 default:
5339 abort ();
5340 }
5341
5342 *beg += begp - begp_orig;
5343 *end += endp - endp_orig;
5344 return;
5345 }
5346
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region *BEG .. *END for CODING (see
   shrink_encoding_region and shrink_decoding_region) when it is
   longer than the threshold above.  ENCODEP nonzero selects the
   encoding variant.  BEG and END are pointers and are evaluated more
   than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
5360
5361 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5362 Vlast_coding_system_used and the remaining elements are buffers to
5363 kill. */
5364 static Lisp_Object
5365 code_convert_region_unwind (arg)
5366 Lisp_Object arg;
5367 {
5368 struct gcpro gcpro1;
5369 GCPRO1 (arg);
5370
5371 inhibit_pre_post_conversion = 0;
5372 Vlast_coding_system_used = XCAR (arg);
5373 for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5374 Fkill_buffer (XCAR (arg));
5375
5376 UNGCPRO;
5377 return Qnil;
5378 }
5379
5380 /* Store information about all compositions in the range FROM and TO
5381 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5382 buffer or a string, defaults to the current buffer. */
5383
5384 void
5385 coding_save_composition (coding, from, to, obj)
5386 struct coding_system *coding;
5387 int from, to;
5388 Lisp_Object obj;
5389 {
5390 Lisp_Object prop;
5391 int start, end;
5392
5393 if (coding->composing == COMPOSITION_DISABLED)
5394 return;
5395 if (!coding->cmp_data)
5396 coding_allocate_composition_data (coding, from);
5397 if (!find_composition (from, to, &start, &end, &prop, obj)
5398 || end > to)
5399 return;
5400 if (start < from
5401 && (!find_composition (end, to, &start, &end, &prop, obj)
5402 || end > to))
5403 return;
5404 coding->composing = COMPOSITION_NO;
5405 do
5406 {
5407 if (COMPOSITION_VALID_P (start, end, prop))
5408 {
5409 enum composition_method method = COMPOSITION_METHOD (prop);
5410 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5411 >= COMPOSITION_DATA_SIZE)
5412 coding_allocate_composition_data (coding, from);
5413 /* For relative composition, we remember start and end
5414 positions, for the other compositions, we also remember
5415 components. */
5416 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5417 if (method != COMPOSITION_RELATIVE)
5418 {
5419 /* We must store a*/
5420 Lisp_Object val, ch;
5421
5422 val = COMPOSITION_COMPONENTS (prop);
5423 if (CONSP (val))
5424 while (CONSP (val))
5425 {
5426 ch = XCAR (val), val = XCDR (val);
5427 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5428 }
5429 else if (VECTORP (val) || STRINGP (val))
5430 {
5431 int len = (VECTORP (val)
5432 ? XVECTOR (val)->size : SCHARS (val));
5433 int i;
5434 for (i = 0; i < len; i++)
5435 {
5436 ch = (STRINGP (val)
5437 ? Faref (val, make_number (i))
5438 : XVECTOR (val)->contents[i]);
5439 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5440 }
5441 }
5442 else /* INTEGERP (val) */
5443 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5444 }
5445 CODING_ADD_COMPOSITION_END (coding, end - from);
5446 }
5447 start = end;
5448 }
5449 while (start < to
5450 && find_composition (start, to, &start, &end, &prop, obj)
5451 && end <= to);
5452
5453 /* Make coding->cmp_data point to the first memory block. */
5454 while (coding->cmp_data->prev)
5455 coding->cmp_data = coding->cmp_data->prev;
5456 coding->cmp_data_start = 0;
5457 }
5458
5459 /* Reflect the saved information about compositions to OBJ.
5460 CODING->cmp_data points to a memory block for the information. OBJ
5461 is a buffer or a string, defaults to the current buffer. */
5462
5463 void
5464 coding_restore_composition (coding, obj)
5465 struct coding_system *coding;
5466 Lisp_Object obj;
5467 {
5468 struct composition_data *cmp_data = coding->cmp_data;
5469
5470 if (!cmp_data)
5471 return;
5472
5473 while (cmp_data->prev)
5474 cmp_data = cmp_data->prev;
5475
5476 while (cmp_data)
5477 {
5478 int i;
5479
5480 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5481 i += cmp_data->data[i])
5482 {
5483 int *data = cmp_data->data + i;
5484 enum composition_method method = (enum composition_method) data[3];
5485 Lisp_Object components;
5486
5487 if (data[0] < 0 || i + data[0] > cmp_data->used)
5488 /* Invalid composition data. */
5489 break;
5490
5491 if (method == COMPOSITION_RELATIVE)
5492 components = Qnil;
5493 else
5494 {
5495 int len = data[0] - 4, j;
5496 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5497
5498 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5499 && len % 2 == 0)
5500 len --;
5501 if (len < 1)
5502 /* Invalid composition data. */
5503 break;
5504 for (j = 0; j < len; j++)
5505 args[j] = make_number (data[4 + j]);
5506 components = (method == COMPOSITION_WITH_ALTCHARS
5507 ? Fstring (len, args)
5508 : Fvector (len, args));
5509 }
5510 compose_text (data[1], data[2], components, Qnil, obj);
5511 }
5512 cmp_data = cmp_data->next;
5513 }
5514 }
5515
5516 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5517 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5518 coding system CODING, and return the status code of code conversion
5519 (currently, this value has no meaning).
5520
5521 How many characters (and bytes) are converted to how many
5522 characters (and bytes) are recorded in members of the structure
5523 CODING.
5524
5525 If REPLACE is nonzero, we do various things as if the original text
5526 is deleted and a new text is inserted. See the comments in
5527 replace_range (insdel.c) to know what we are doing.
5528
5529 If REPLACE is zero, it is assumed that the source text is unibyte.
5530 Otherwise, it is assumed that the source text is multibyte. */
5531
5532 int
5533 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5534 int from, from_byte, to, to_byte, encodep, replace;
5535 struct coding_system *coding;
5536 {
5537 int len = to - from, len_byte = to_byte - from_byte;
5538 int nchars_del = 0, nbytes_del = 0;
5539 int require, inserted, inserted_byte;
5540 int head_skip, tail_skip, total_skip = 0;
5541 Lisp_Object saved_coding_symbol;
5542 int first = 1;
5543 unsigned char *src, *dst;
5544 Lisp_Object deletion;
5545 int orig_point = PT, orig_len = len;
5546 int prev_Z;
5547 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5548
5549 deletion = Qnil;
5550 saved_coding_symbol = coding->symbol;
5551
5552 if (from < PT && PT < to)
5553 {
5554 TEMP_SET_PT_BOTH (from, from_byte);
5555 orig_point = from;
5556 }
5557
5558 if (replace)
5559 {
5560 int saved_from = from;
5561 int saved_inhibit_modification_hooks;
5562
5563 prepare_to_modify_buffer (from, to, &from);
5564 if (saved_from != from)
5565 {
5566 to = from + len;
5567 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5568 len_byte = to_byte - from_byte;
5569 }
5570
5571 /* The code conversion routine can not preserve text properties
5572 for now. So, we must remove all text properties in the
5573 region. Here, we must suppress all modification hooks. */
5574 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5575 inhibit_modification_hooks = 1;
5576 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5577 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5578 }
5579
5580 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5581 {
5582 /* We must detect encoding of text and eol format. */
5583
5584 if (from < GPT && to > GPT)
5585 move_gap_both (from, from_byte);
5586 if (coding->type == coding_type_undecided)
5587 {
5588 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5589 if (coding->type == coding_type_undecided)
5590 {
5591 /* It seems that the text contains only ASCII, but we
5592 should not leave it undecided because the deeper
5593 decoding routine (decode_coding) tries to detect the
5594 encodings again in vain. */
5595 coding->type = coding_type_emacs_mule;
5596 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5597 /* As emacs-mule decoder will handle composition, we
5598 need this setting to allocate coding->cmp_data
5599 later. */
5600 coding->composing = COMPOSITION_NO;
5601 }
5602 }
5603 if (coding->eol_type == CODING_EOL_UNDECIDED
5604 && coding->type != coding_type_ccl)
5605 {
5606 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5607 if (coding->eol_type == CODING_EOL_UNDECIDED)
5608 coding->eol_type = CODING_EOL_LF;
5609 /* We had better recover the original eol format if we
5610 encounter an inconsistent eol format while decoding. */
5611 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5612 }
5613 }
5614
5615 /* Now we convert the text. */
5616
5617 /* For encoding, we must process pre-write-conversion in advance. */
5618 if (! inhibit_pre_post_conversion
5619 && encodep
5620 && SYMBOLP (coding->pre_write_conversion)
5621 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5622 {
5623 /* The function in pre-write-conversion may put a new text in a
5624 new buffer. */
5625 struct buffer *prev = current_buffer;
5626 Lisp_Object new;
5627
5628 record_unwind_protect (code_convert_region_unwind,
5629 Fcons (Vlast_coding_system_used, Qnil));
5630 /* We should not call any more pre-write/post-read-conversion
5631 functions while this pre-write-conversion is running. */
5632 inhibit_pre_post_conversion = 1;
5633 call2 (coding->pre_write_conversion,
5634 make_number (from), make_number (to));
5635 inhibit_pre_post_conversion = 0;
5636 /* Discard the unwind protect. */
5637 specpdl_ptr--;
5638
5639 if (current_buffer != prev)
5640 {
5641 len = ZV - BEGV;
5642 new = Fcurrent_buffer ();
5643 set_buffer_internal_1 (prev);
5644 del_range_2 (from, from_byte, to, to_byte, 0);
5645 TEMP_SET_PT_BOTH (from, from_byte);
5646 insert_from_buffer (XBUFFER (new), 1, len, 0);
5647 Fkill_buffer (new);
5648 if (orig_point >= to)
5649 orig_point += len - orig_len;
5650 else if (orig_point > from)
5651 orig_point = from;
5652 orig_len = len;
5653 to = from + len;
5654 from_byte = CHAR_TO_BYTE (from);
5655 to_byte = CHAR_TO_BYTE (to);
5656 len_byte = to_byte - from_byte;
5657 TEMP_SET_PT_BOTH (from, from_byte);
5658 }
5659 }
5660
5661 if (replace)
5662 {
5663 if (! EQ (current_buffer->undo_list, Qt))
5664 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5665 else
5666 {
5667 nchars_del = to - from;
5668 nbytes_del = to_byte - from_byte;
5669 }
5670 }
5671
5672 if (coding->composing != COMPOSITION_DISABLED)
5673 {
5674 if (encodep)
5675 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5676 else
5677 coding_allocate_composition_data (coding, from);
5678 }
5679
5680 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5681 if we must run CCL program or there are compositions to
5682 encode. */
5683 if (coding->type != coding_type_ccl
5684 && (! coding->cmp_data || coding->cmp_data->used == 0))
5685 {
5686 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5687
5688 if (from < GPT && GPT < to)
5689 move_gap_both (from, from_byte);
5690 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5691 if (from_byte == to_byte
5692 && (encodep || NILP (coding->post_read_conversion))
5693 && ! CODING_REQUIRE_FLUSHING (coding))
5694 {
5695 coding->produced = len_byte;
5696 coding->produced_char = len;
5697 if (!replace)
5698 /* We must record and adjust for this new text now. */
5699 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5700 coding_free_composition_data (coding);
5701 return 0;
5702 }
5703
5704 head_skip = from_byte - from_byte_orig;
5705 tail_skip = to_byte_orig - to_byte;
5706 total_skip = head_skip + tail_skip;
5707 from += head_skip;
5708 to -= tail_skip;
5709 len -= total_skip; len_byte -= total_skip;
5710 }
5711
5712 /* For conversion, we must put the gap before the text in addition to
5713 making the gap larger for efficient decoding. The required gap
5714 size starts from 2000 which is the magic number used in make_gap.
5715 But, after one batch of conversion, it will be incremented if we
5716 find that it is not enough . */
5717 require = 2000;
5718
5719 if (GAP_SIZE < require)
5720 make_gap (require - GAP_SIZE);
5721 move_gap_both (from, from_byte);
5722
5723 inserted = inserted_byte = 0;
5724
5725 GAP_SIZE += len_byte;
5726 ZV -= len;
5727 Z -= len;
5728 ZV_BYTE -= len_byte;
5729 Z_BYTE -= len_byte;
5730
5731 if (GPT - BEG < BEG_UNCHANGED)
5732 BEG_UNCHANGED = GPT - BEG;
5733 if (Z - GPT < END_UNCHANGED)
5734 END_UNCHANGED = Z - GPT;
5735
5736 if (!encodep && coding->src_multibyte)
5737 {
5738 /* Decoding routines expects that the source text is unibyte.
5739 We must convert 8-bit characters of multibyte form to
5740 unibyte. */
5741 int len_byte_orig = len_byte;
5742 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5743 if (len_byte < len_byte_orig)
5744 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5745 len_byte);
5746 coding->src_multibyte = 0;
5747 }
5748
5749 for (;;)
5750 {
5751 int result;
5752
5753 /* The buffer memory is now:
5754 +--------+converted-text+---------+-------original-text-------+---+
5755 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5756 |<---------------------- GAP ----------------------->| */
5757 src = GAP_END_ADDR - len_byte;
5758 dst = GPT_ADDR + inserted_byte;
5759
5760 if (encodep)
5761 result = encode_coding (coding, src, dst, len_byte, 0);
5762 else
5763 {
5764 if (coding->composing != COMPOSITION_DISABLED)
5765 coding->cmp_data->char_offset = from + inserted;
5766 result = decode_coding (coding, src, dst, len_byte, 0);
5767 }
5768
5769 /* The buffer memory is now:
5770 +--------+-------converted-text----+--+------original-text----+---+
5771 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5772 |<---------------------- GAP ----------------------->| */
5773
5774 inserted += coding->produced_char;
5775 inserted_byte += coding->produced;
5776 len_byte -= coding->consumed;
5777
5778 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5779 {
5780 coding_allocate_composition_data (coding, from + inserted);
5781 continue;
5782 }
5783
5784 src += coding->consumed;
5785 dst += coding->produced;
5786
5787 if (result == CODING_FINISH_NORMAL)
5788 {
5789 src += len_byte;
5790 break;
5791 }
5792 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5793 {
5794 unsigned char *pend = dst, *p = pend - inserted_byte;
5795 Lisp_Object eol_type;
5796
5797 /* Encode LFs back to the original eol format (CR or CRLF). */
5798 if (coding->eol_type == CODING_EOL_CR)
5799 {
5800 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5801 }
5802 else
5803 {
5804 int count = 0;
5805
5806 while (p < pend) if (*p++ == '\n') count++;
5807 if (src - dst < count)
5808 {
5809 /* We don't have sufficient room for encoding LFs
5810 back to CRLF. We must record converted and
5811 not-yet-converted text back to the buffer
5812 content, enlarge the gap, then record them out of
5813 the buffer contents again. */
5814 int add = len_byte + inserted_byte;
5815
5816 GAP_SIZE -= add;
5817 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5818 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5819 make_gap (count - GAP_SIZE);
5820 GAP_SIZE += add;
5821 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5822 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5823 /* Don't forget to update SRC, DST, and PEND. */
5824 src = GAP_END_ADDR - len_byte;
5825 dst = GPT_ADDR + inserted_byte;
5826 pend = dst;
5827 }
5828 inserted += count;
5829 inserted_byte += count;
5830 coding->produced += count;
5831 p = dst = pend + count;
5832 while (count)
5833 {
5834 *--p = *--pend;
5835 if (*p == '\n') count--, *--p = '\r';
5836 }
5837 }
5838
5839 /* Suppress eol-format conversion in the further conversion. */
5840 coding->eol_type = CODING_EOL_LF;
5841
5842 /* Set the coding system symbol to that for Unix-like EOL. */
5843 eol_type = Fget (saved_coding_symbol, Qeol_type);
5844 if (VECTORP (eol_type)
5845 && XVECTOR (eol_type)->size == 3
5846 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5847 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5848 else
5849 coding->symbol = saved_coding_symbol;
5850
5851 continue;
5852 }
5853 if (len_byte <= 0)
5854 {
5855 if (coding->type != coding_type_ccl
5856 || coding->mode & CODING_MODE_LAST_BLOCK)
5857 break;
5858 coding->mode |= CODING_MODE_LAST_BLOCK;
5859 continue;
5860 }
5861 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5862 {
5863 /* The source text ends in invalid codes. Let's just
5864 make them valid buffer contents, and finish conversion. */
5865 if (multibyte_p)
5866 {
5867 unsigned char *start = dst;
5868
5869 inserted += len_byte;
5870 while (len_byte--)
5871 {
5872 int c = *src++;
5873 dst += CHAR_STRING (c, dst);
5874 }
5875
5876 inserted_byte += dst - start;
5877 }
5878 else
5879 {
5880 inserted += len_byte;
5881 inserted_byte += len_byte;
5882 while (len_byte--)
5883 *dst++ = *src++;
5884 }
5885 break;
5886 }
5887 if (result == CODING_FINISH_INTERRUPT)
5888 {
5889 /* The conversion procedure was interrupted by a user. */
5890 break;
5891 }
5892 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5893 if (coding->consumed < 1)
5894 {
5895 /* It's quite strange to require more memory without
5896 consuming any bytes. Perhaps CCL program bug. */
5897 break;
5898 }
5899 if (first)
5900 {
5901 /* We have just done the first batch of conversion which was
5902 stopped because of insufficient gap. Let's reconsider the
5903 required gap size (i.e. SRT - DST) now.
5904
5905 We have converted ORIG bytes (== coding->consumed) into
5906 NEW bytes (coding->produced). To convert the remaining
5907 LEN bytes, we may need REQUIRE bytes of gap, where:
5908 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5909 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5910 Here, we are sure that NEW >= ORIG. */
5911
5912 if (coding->produced <= coding->consumed)
5913 {
5914 /* This happens because of CCL-based coding system with
5915 eol-type CRLF. */
5916 require = 0;
5917 }
5918 else
5919 {
5920 float ratio = coding->produced - coding->consumed;
5921 ratio /= coding->consumed;
5922 require = len_byte * ratio;
5923 }
5924 first = 0;
5925 }
5926 if ((src - dst) < (require + 2000))
5927 {
5928 /* See the comment above the previous call of make_gap. */
5929 int add = len_byte + inserted_byte;
5930
5931 GAP_SIZE -= add;
5932 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5933 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5934 make_gap (require + 2000);
5935 GAP_SIZE += add;
5936 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5937 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5938 }
5939 }
5940 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5941
5942 if (encodep && coding->dst_multibyte)
5943 {
5944 /* The output is unibyte. We must convert 8-bit characters to
5945 multibyte form. */
5946 if (inserted_byte * 2 > GAP_SIZE)
5947 {
5948 GAP_SIZE -= inserted_byte;
5949 ZV += inserted_byte; Z += inserted_byte;
5950 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5951 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5952 make_gap (inserted_byte - GAP_SIZE);
5953 GAP_SIZE += inserted_byte;
5954 ZV -= inserted_byte; Z -= inserted_byte;
5955 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5956 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5957 }
5958 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5959 }
5960
5961 /* If we shrank the conversion area, adjust it now. */
5962 if (total_skip > 0)
5963 {
5964 if (tail_skip > 0)
5965 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5966 inserted += total_skip; inserted_byte += total_skip;
5967 GAP_SIZE += total_skip;
5968 GPT -= head_skip; GPT_BYTE -= head_skip;
5969 ZV -= total_skip; ZV_BYTE -= total_skip;
5970 Z -= total_skip; Z_BYTE -= total_skip;
5971 from -= head_skip; from_byte -= head_skip;
5972 to += tail_skip; to_byte += tail_skip;
5973 }
5974
5975 prev_Z = Z;
5976 if (! EQ (current_buffer->undo_list, Qt))
5977 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5978 else
5979 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5980 inserted, inserted_byte);
5981 inserted = Z - prev_Z;
5982
5983 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5984 coding_restore_composition (coding, Fcurrent_buffer ());
5985 coding_free_composition_data (coding);
5986
5987 if (! inhibit_pre_post_conversion
5988 && ! encodep && ! NILP (coding->post_read_conversion))
5989 {
5990 Lisp_Object val;
5991 Lisp_Object saved_coding_system;
5992
5993 if (from != PT)
5994 TEMP_SET_PT_BOTH (from, from_byte);
5995 prev_Z = Z;
5996 record_unwind_protect (code_convert_region_unwind,
5997 Fcons (Vlast_coding_system_used, Qnil));
5998 saved_coding_system = Vlast_coding_system_used;
5999 Vlast_coding_system_used = coding->symbol;
6000 /* We should not call any more pre-write/post-read-conversion
6001 functions while this post-read-conversion is running. */
6002 inhibit_pre_post_conversion = 1;
6003 val = call1 (coding->post_read_conversion, make_number (inserted));
6004 inhibit_pre_post_conversion = 0;
6005 coding->symbol = Vlast_coding_system_used;
6006 Vlast_coding_system_used = saved_coding_system;
6007 /* Discard the unwind protect. */
6008 specpdl_ptr--;
6009 CHECK_NUMBER (val);
6010 inserted += Z - prev_Z;
6011 }
6012
6013 if (orig_point >= from)
6014 {
6015 if (orig_point >= from + orig_len)
6016 orig_point += inserted - orig_len;
6017 else
6018 orig_point = from;
6019 TEMP_SET_PT (orig_point);
6020 }
6021
6022 if (replace)
6023 {
6024 signal_after_change (from, to - from, inserted);
6025 update_compositions (from, from + inserted, CHECK_BORDER);
6026 }
6027
6028 {
6029 coding->consumed = to_byte - from_byte;
6030 coding->consumed_char = to - from;
6031 coding->produced = inserted_byte;
6032 coding->produced_char = inserted;
6033 }
6034
6035 return 0;
6036 }
6037
6038 /* Name (or base name) of work buffer for code conversion.
   set_conversion_work_buffer passes it to Fget_buffer_create, and to
   Fgenerate_new_buffer_name when the buffer of that name is already
   the current buffer. */
6039 static Lisp_Object Vcode_conversion_workbuf_name;
6040
6041 /* Set the current buffer to the working buffer prepared for
6042 code-conversion. MULTIBYTE specifies the multibyteness of the
6043 buffer. Return the buffer we set if it must be killed after use.
6044 Otherwise return Qnil. */
6045
6046 static Lisp_Object
6047 set_conversion_work_buffer (multibyte)
6048 int multibyte;
6049 {
6050 Lisp_Object buffer, buffer_to_kill;
6051 struct buffer *buf;
6052
6053 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6054 buf = XBUFFER (buffer);
6055 if (buf == current_buffer)
6056 {
6057 /* As we are already in the work buffer, we must generate a new
6058 buffer for the work. */
6059 Lisp_Object name;
6060
6061 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6062 buffer = buffer_to_kill = Fget_buffer_create (name);
6063 buf = XBUFFER (buffer);
6064 }
6065 else
6066 buffer_to_kill = Qnil;
6067
6068 delete_all_overlays (buf);
6069 buf->directory = current_buffer->directory;
6070 buf->read_only = Qnil;
6071 buf->filename = Qnil;
6072 buf->undo_list = Qt;
6073 eassert (buf->overlays_before == NULL);
6074 eassert (buf->overlays_after == NULL);
6075 set_buffer_internal (buf);
6076 if (BEG != BEGV || Z != ZV)
6077 Fwiden ();
6078 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6079 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6080 return buffer_to_kill;
6081 }
6082
6083 Lisp_Object
6084 run_pre_post_conversion_on_str (str, coding, encodep)
6085 Lisp_Object str;
6086 struct coding_system *coding;
6087 int encodep;
6088 {
6089 int count = SPECPDL_INDEX ();
6090 struct gcpro gcpro1, gcpro2;
6091 int multibyte = STRING_MULTIBYTE (str);
6092 Lisp_Object old_deactivate_mark;
6093 Lisp_Object buffer_to_kill;
6094 Lisp_Object unwind_arg;
6095
6096 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6097 /* It is not crucial to specbind this. */
6098 old_deactivate_mark = Vdeactivate_mark;
6099 GCPRO2 (str, old_deactivate_mark);
6100
6101 /* We must insert the contents of STR as is without
6102 unibyte<->multibyte conversion. For that, we adjust the
6103 multibyteness of the working buffer to that of STR. */
6104 buffer_to_kill = set_conversion_work_buffer (multibyte);
6105 if (NILP (buffer_to_kill))
6106 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6107 else
6108 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6109 record_unwind_protect (code_convert_region_unwind, unwind_arg);
6110
6111 insert_from_string (str, 0, 0,
6112 SCHARS (str), SBYTES (str), 0);
6113 UNGCPRO;
6114 inhibit_pre_post_conversion = 1;
6115 if (encodep)
6116 {
6117 struct buffer *prev = current_buffer;
6118
6119 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6120 if (prev != current_buffer)
6121 /* We must kill the current buffer too. */
6122 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6123 }
6124 else
6125 {
6126 Vlast_coding_system_used = coding->symbol;
6127 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6128 call1 (coding->post_read_conversion, make_number (Z - BEG));
6129 coding->symbol = Vlast_coding_system_used;
6130 }
6131 inhibit_pre_post_conversion = 0;
6132 Vdeactivate_mark = old_deactivate_mark;
6133 str = make_buffer_string (BEG, Z, 1);
6134 return unbind_to (count, str);
6135 }
6136
6137
6138 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6139 text in *STR. *SIZE is the allocated bytes for STR. As it
6140 is intended that this function is called from encode_terminal_code,
6141 the pre-write-conversion function is run by safe_call and thus
6142 "Error during redisplay: ..." is logged when an error occurs.
6143
6144 Store the resulting text in *STR and set CODING->produced_char and
6145 CODING->produced to the number of characters and bytes
6146 respectively. If the size of *STR is too small, enlarge it by
6147 xrealloc and update *STR and *SIZE. */
6148
6149 void
6150 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6151 unsigned char **str;
6152 int *size, nchars, nbytes;
6153 struct coding_system *coding;
6154 {
6155 struct gcpro gcpro1, gcpro2;
6156 struct buffer *cur = current_buffer;
6157 struct buffer *prev;
6158 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6159 Lisp_Object args[3];
6160 Lisp_Object buffer_to_kill;
6161
6162 /* It is not crucial to specbind this. */
6163 old_deactivate_mark = Vdeactivate_mark;
6164 old_last_coding_system_used = Vlast_coding_system_used;
6165 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6166
6167 /* We must insert the contents of STR as is without
6168 unibyte<->multibyte conversion. For that, we adjust the
6169 multibyteness of the working buffer to that of STR. */
6170 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6171 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6172 UNGCPRO;
6173 inhibit_pre_post_conversion = 1;
6174 prev = current_buffer;
6175 args[0] = coding->pre_write_conversion;
6176 args[1] = make_number (BEG);
6177 args[2] = make_number (Z);
6178 safe_call (3, args);
6179 inhibit_pre_post_conversion = 0;
6180 Vdeactivate_mark = old_deactivate_mark;
6181 Vlast_coding_system_used = old_last_coding_system_used;
6182 coding->produced_char = Z - BEG;
6183 coding->produced = Z_BYTE - BEG_BYTE;
6184 if (coding->produced > *size)
6185 {
6186 *size = coding->produced;
6187 *str = xrealloc (*str, *size);
6188 }
6189 if (BEG < GPT && GPT < Z)
6190 move_gap (BEG);
6191 bcopy (BEG_ADDR, *str, coding->produced);
6192 coding->src_multibyte
6193 = ! NILP (current_buffer->enable_multibyte_characters);
6194 if (prev != current_buffer)
6195 Fkill_buffer (Fcurrent_buffer ());
6196 set_buffer_internal (cur);
6197 if (! NILP (buffer_to_kill))
6198 Fkill_buffer (buffer_to_kill);
6199 }
6200
6201
/* Decode the text of string STR according to CODING and return the
   result as a Lisp string.

   If CODING requires detection, the coding type and/or the
   end-of-line format are first detected from the bytes of STR.  A
   multibyte STR is converted with Fstring_as_unibyte before decoding.

   If no decoding turns out to be necessary and
   CODING->post_read_conversion is not an fbound symbol, STR is
   returned (after Fstring_as_multibyte when CODING->dst_multibyte is
   set); it is copied with Fcopy_sequence only if NOCOPY is still zero
   at that point.

   Otherwise the text is decoded into a temporary conversion buffer, a
   new string is built from it together with any head and tail bytes
   that were excluded from conversion, compositions are restored, and
   the post-read-conversion function, if fbound, is run on the result.

   CODING->consumed, consumed_char, produced and produced_char are set
   to describe the whole of STR and the decoded text as it is before
   any post-read-conversion function runs.  */
6202 Lisp_Object
6203 decode_coding_string (str, coding, nocopy)
6204 Lisp_Object str;
6205 struct coding_system *coding;
6206 int nocopy;
6207 {
6208 int len;
6209 struct conversion_buffer buf;
6210 int from, to_byte;
6211 Lisp_Object saved_coding_symbol;
6212 int result;
6213 int require_decoding;
/* Number of head plus tail bytes of STR excluded from conversion by
   SHRINK_CONVERSION_REGION; they are copied to the result verbatim. */
6214 int shrinked_bytes = 0;
6215 Lisp_Object newstr;
/* Running totals over all decode_coding calls in the loop below. */
6216 int consumed, consumed_char, produced, produced_char;
6217
6218 from = 0;
6219 to_byte = SBYTES (str);
6220
6221 saved_coding_symbol = coding->symbol;
6222 coding->src_multibyte = STRING_MULTIBYTE (str);
6223 coding->dst_multibyte = 1;
6224 if (CODING_REQUIRE_DETECTION (coding))
6225 {
6226 /* See the comments in code_convert_region. */
6227 if (coding->type == coding_type_undecided)
6228 {
6229 detect_coding (coding, SDATA (str), to_byte);
/* Detection left the type undecided: fall back to emacs-mule. */
6230 if (coding->type == coding_type_undecided)
6231 {
6232 coding->type = coding_type_emacs_mule;
6233 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6234 /* As emacs-mule decoder will handle composition, we
6235 need this setting to allocate coding->cmp_data
6236 later. */
6237 coding->composing = COMPOSITION_NO;
6238 }
6239 }
6240 if (coding->eol_type == CODING_EOL_UNDECIDED
6241 && coding->type != coding_type_ccl)
6242 {
6243 saved_coding_symbol = coding->symbol;
6244 detect_eol (coding, SDATA (str), to_byte);
/* Detection left the eol format undecided: fall back to LF. */
6245 if (coding->eol_type == CODING_EOL_UNDECIDED)
6246 coding->eol_type = CODING_EOL_LF;
6247 /* We had better recover the original eol format if we
6248 encounter an inconsistent eol format while decoding. */
6249 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6250 }
6251 }
6252
/* These two coding types produce unibyte text. */
6253 if (coding->type == coding_type_no_conversion
6254 || coding->type == coding_type_raw_text)
6255 coding->dst_multibyte = 0;
6256
6257 require_decoding = CODING_REQUIRE_DECODING (coding);
6258
6259 if (STRING_MULTIBYTE (str))
6260 {
6261 /* Decoding routines expect the source text to be unibyte. */
6262 str = Fstring_as_unibyte (str);
6263 to_byte = SBYTES (str);
/* NOTE(review): presumably STR is now a fresh string, so no further
   copy is needed -- confirm against Fstring_as_unibyte. */
6264 nocopy = 1;
6265 coding->src_multibyte = 0;
6266 }
6267
6268 /* Try to skip the heading and tailing ASCIIs. */
6269 if (require_decoding && coding->type != coding_type_ccl)
6270 {
6271 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6272 0);
/* Nothing is left between the skipped head and tail. */
6273 if (from == to_byte)
6274 require_decoding = 0;
6275 shrinked_bytes = from + (SBYTES (str) - to_byte);
6276 }
6277
/* Fast path: nothing to decode and no post-read-conversion function
   to run, so STR itself (or a copy of it) is the result. */
6278 if (!require_decoding
6279 && !(SYMBOLP (coding->post_read_conversion)
6280 && !NILP (Ffboundp (coding->post_read_conversion))))
6281 {
6282 coding->consumed = SBYTES (str);
6283 coding->consumed_char = SCHARS (str);
6284 if (coding->dst_multibyte)
6285 {
6286 str = Fstring_as_multibyte (str);
6287 nocopy = 1;
6288 }
6289 coding->produced = SBYTES (str);
6290 coding->produced_char = SCHARS (str);
6291 return (nocopy ? str : Fcopy_sequence (str));
6292 }
6293
6294 if (coding->composing != COMPOSITION_DISABLED)
6295 coding_allocate_composition_data (coding, from);
6296 len = decoding_buffer_size (coding, to_byte - from);
6297 allocate_conversion_buffer (buf, len);
6298
/* Decode in as many rounds as needed.  Each round continues where
   the previous one stopped, both in the source (FROM + CONSUMED) and
   in the conversion buffer (PRODUCED). */
6299 consumed = consumed_char = produced = produced_char = 0;
6300 while (1)
6301 {
6302 result = decode_coding (coding, SDATA (str) + from + consumed,
6303 buf.data + produced, to_byte - from - consumed,
6304 buf.size - produced);
6305 consumed += coding->consumed;
6306 consumed_char += coding->consumed_char;
6307 produced += coding->produced;
6308 produced_char += coding->produced_char;
/* Stop when decoding finished or was interrupted, or when the
   source ran short and this round consumed nothing. */
6309 if (result == CODING_FINISH_NORMAL
6310 || result == CODING_FINISH_INTERRUPT
6311 || (result == CODING_FINISH_INSUFFICIENT_SRC
6312 && coding->consumed == 0))
6313 break;
6314 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6315 coding_allocate_composition_data (coding, from + produced_char);
6316 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6317 extend_conversion_buffer (&buf);
6318 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6319 {
6320 Lisp_Object eol_type;
6321
6322 /* Recover the original EOL format. */
6323 if (coding->eol_type == CODING_EOL_CR)
6324 {
6325 unsigned char *p;
6326 for (p = buf.data; p < buf.data + produced; p++)
6327 if (*p == '\n') *p = '\r';
6328 }
6329 else if (coding->eol_type == CODING_EOL_CRLF)
6330 {
6331 int num_eol = 0;
6332 unsigned char *p0, *p1;
/* Count the newlines produced so far; each one gets a '\r' put
   back in front of it, growing the text by NUM_EOL bytes. */
6333 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6334 if (*p0 == '\n') num_eol++;
6335 if (produced + num_eol >= buf.size)
6336 extend_conversion_buffer (&buf);
/* Copy backward from the end so the expansion can be done in
   place: P0 reads the old text, P1 writes the expanded text. */
6337 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6338 {
6339 *--p1 = *--p0;
6340 if (*p0 == '\n') *--p1 = '\r';
6341 }
6342 produced += num_eol;
6343 produced_char += num_eol;
6344 }
6345 /* Suppress eol-format conversion in the further conversion. */
6346 coding->eol_type = CODING_EOL_LF;
6347
6348 /* Set the coding system symbol to that for Unix-like EOL. */
6349 eol_type = Fget (saved_coding_symbol, Qeol_type);
6350 if (VECTORP (eol_type)
6351 && XVECTOR (eol_type)->size == 3
6352 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6353 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6354 else
6355 coding->symbol = saved_coding_symbol;
6356
6357
6358 }
6359 }
6360
6361 coding->consumed = consumed;
6362 coding->consumed_char = consumed_char;
6363 coding->produced = produced;
6364 coding->produced_char = produced_char;
6365
/* Build the result: skipped head, decoded text, skipped tail. */
6366 if (coding->dst_multibyte)
6367 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6368 produced + shrinked_bytes);
6369 else
6370 newstr = make_uninit_string (produced + shrinked_bytes);
6371 if (from > 0)
6372 STRING_COPYIN (newstr, 0, SDATA (str), from);
6373 STRING_COPYIN (newstr, from, buf.data, produced);
/* SHRINKED_BYTES - FROM is the length of the skipped tail. */
6374 if (shrinked_bytes > from)
6375 STRING_COPYIN (newstr, from + produced,
6376 SDATA (str) + to_byte,
6377 shrinked_bytes - from);
6378 free_conversion_buffer (&buf);
6379
/* The skipped bytes are added to the byte counts and to the character
   counts alike, i.e. each one is counted as one character. */
6380 coding->consumed += shrinked_bytes;
6381 coding->consumed_char += shrinked_bytes;
6382 coding->produced += shrinked_bytes;
6383 coding->produced_char += shrinked_bytes;
6384
6385 if (coding->cmp_data && coding->cmp_data->used)
6386 coding_restore_composition (coding, newstr);
6387 coding_free_composition_data (coding);
6388
6389 if (SYMBOLP (coding->post_read_conversion)
6390 && !NILP (Ffboundp (coding->post_read_conversion)))
6391 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6392
6393 return newstr;
6394 }
6395
/* Encode the text of string STR according to CODING and return the
   result as a Lisp string.

   If CODING->pre_write_conversion is an fbound symbol, it is run on
   STR first (through run_pre_post_conversion_on_str) and its result
   is the text that gets encoded.

   If CODING does not require encoding, or SHRINK_CONVERSION_REGION
   leaves nothing to convert, control goes to no_need_of_encoding: a
   multibyte STR is made unibyte (in place with STRING_SET_UNIBYTE
   when NOCOPY is nonzero, through Fstring_as_unibyte otherwise), and
   the value is copied with Fcopy_sequence only if NOCOPY is still
   zero.

   Otherwise the text is encoded into a temporary conversion buffer
   and a new string made by make_uninit_string is returned, holding
   the skipped head bytes, the encoded text and the skipped tail
   bytes.

   CODING->consumed, consumed_char, produced and produced_char are
   set on every path.  NOTE(review): on the encoding path they hold
   the totals of the conversion loop only; unlike in
   decode_coding_string, SHRINKED_BYTES is not added to them --
   confirm whether callers depend on that.  */
6396 Lisp_Object
6397 encode_coding_string (str, coding, nocopy)
6398 Lisp_Object str;
6399 struct coding_system *coding;
6400 int nocopy;
6401 {
6402 int len;
6403 struct conversion_buffer buf;
6404 int from, to, to_byte;
6405 int result;
/* Number of head plus tail bytes of STR excluded from conversion by
   SHRINK_CONVERSION_REGION; they are copied to the result verbatim. */
6406 int shrinked_bytes = 0;
6407 Lisp_Object newstr;
/* Running totals over all encode_coding calls in the loop below. */
6408 int consumed, consumed_char, produced, produced_char;
6409
6410 if (SYMBOLP (coding->pre_write_conversion)
6411 && !NILP (Ffboundp (coding->pre_write_conversion)))
6412 {
6413 str = run_pre_post_conversion_on_str (str, coding, 1);
6414 /* As STR is just newly generated, we don't have to copy it
6415 anymore. */
6416 nocopy = 1;
6417 }
6418
6419 from = 0;
6420 to = SCHARS (str);
6421 to_byte = SBYTES (str);
6422
6423 /* Encoding routines determine the multibyteness of the source text
6424 by coding->src_multibyte. */
6425 coding->src_multibyte = SCHARS (str) < SBYTES (str);
6426 coding->dst_multibyte = 0;
6427 if (! CODING_REQUIRE_ENCODING (coding))
6428 goto no_need_of_encoding;
6429
6430 if (coding->composing != COMPOSITION_DISABLED)
6431 coding_save_composition (coding, from, to, str);
6432
6433 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6434 if we must run CCL program or there are compositions to
6435 encode. */
6436 if (coding->type != coding_type_ccl
6437 && (! coding->cmp_data || coding->cmp_data->used == 0))
6438 {
6439 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6440 1);
/* Nothing is left between the skipped head and tail. */
6441 if (from == to_byte)
6442 {
6443 coding_free_composition_data (coding);
6444 goto no_need_of_encoding;
6445 }
6446 shrinked_bytes = from + (SBYTES (str) - to_byte);
6447 }
6448
6449 len = encoding_buffer_size (coding, to_byte - from);
6450 allocate_conversion_buffer (buf, len);
6451
/* Encode in as many rounds as needed.  Each round continues where
   the previous one stopped, both in the source (FROM + CONSUMED) and
   in the conversion buffer (PRODUCED). */
6452 consumed = consumed_char = produced = produced_char = 0;
6453 while (1)
6454 {
6455 result = encode_coding (coding, SDATA (str) + from + consumed,
6456 buf.data + produced, to_byte - from - consumed,
6457 buf.size - produced);
6458 consumed += coding->consumed;
6459 consumed_char += coding->consumed_char;
6460 produced += coding->produced;
6461 produced_char += coding->produced_char;
/* Stop when encoding finished or was interrupted, or when the
   source ran short and this round consumed nothing. */
6462 if (result == CODING_FINISH_NORMAL
6463 || result == CODING_FINISH_INTERRUPT
6464 || (result == CODING_FINISH_INSUFFICIENT_SRC
6465 && coding->consumed == 0))
6466 break;
6467 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6468 extend_conversion_buffer (&buf);
6469 }
6470
6471 coding->consumed = consumed;
6472 coding->consumed_char = consumed_char;
6473 coding->produced = produced;
6474 coding->produced_char = produced_char;
6475
/* Build the result: skipped head, encoded text, skipped tail. */
6476 newstr = make_uninit_string (produced + shrinked_bytes);
6477 if (from > 0)
6478 STRING_COPYIN (newstr, 0, SDATA (str), from);
6479 STRING_COPYIN (newstr, from, buf.data, produced);
/* SHRINKED_BYTES - FROM is the length of the skipped tail. */
6480 if (shrinked_bytes > from)
6481 STRING_COPYIN (newstr, from + produced,
6482 SDATA (str) + to_byte,
6483 shrinked_bytes - from);
6484
6485 free_conversion_buffer (&buf);
6486 coding_free_composition_data (coding);
6487
6488 return newstr;
6489
/* Reached when STR can be used as the result without conversion. */
6490 no_need_of_encoding:
6491 coding->consumed = SBYTES (str);
6492 coding->consumed_char = SCHARS (str);
6493 if (STRING_MULTIBYTE (str))
6494 {
6495 if (nocopy)
6496 /* We are sure that STR doesn't contain a multibyte
6497 character. */
6498 STRING_SET_UNIBYTE (str);
6499 else
6500 {
6501 str = Fstring_as_unibyte (str);
6502 nocopy = 1;
6503 }
6504 }
6505 coding->produced = SBYTES (str);
6506 coding->produced_char = SCHARS (str);
6507 return (nocopy ? str : Fcopy_sequence (str));
6508 }
6509
6510 \f
6511 #ifdef emacs
6512 /*** 8. Emacs Lisp library functions ***/
6513
6514 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6515 doc: /* Return t if OBJECT is nil or a coding-system.
6516 See the documentation of `make-coding-system' for information
6517 about coding-system objects. */)
6518 (obj)
6519 Lisp_Object obj;
6520 {
6521 if (NILP (obj))
6522 return Qt;
6523 if (!SYMBOLP (obj))
6524 return Qnil;
6525 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6526 return Qt;
6527 /* Get coding-spec vector for OBJ. */
6528 obj = Fget (obj, Qcoding_system);
6529 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6530 ? Qt : Qnil);
6531 }
6532
6533 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6534 Sread_non_nil_coding_system, 1, 1, 0,
6535 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6536 (prompt)
6537 Lisp_Object prompt;
6538 {
6539 Lisp_Object val;
6540 do
6541 {
6542 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6543 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6544 }
6545 while (SCHARS (val) == 0);
6546 return (Fintern (val, Qnil));
6547 }
6548
6549 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6550 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6551 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6552 (prompt, default_coding_system)
6553 Lisp_Object prompt, default_coding_system;
6554 {
6555 Lisp_Object val;
6556 if (SYMBOLP (default_coding_system))
6557 default_coding_system = SYMBOL_NAME (default_coding_system);
6558 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6559 Qt, Qnil, Qcoding_system_history,
6560 default_coding_system, Qnil);
6561 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6562 }
6563
6564 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6565 1, 1, 0,
6566 doc: /* Check validity of CODING-SYSTEM.
6567 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6568 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6569 The value of this property should be a vector of length 5. */)
6570 (coding_system)
6571 Lisp_Object coding_system;
6572 {
6573 Lisp_Object define_form;
6574
6575 define_form = Fget (coding_system, Qcoding_system_define_form);
6576 if (! NILP (define_form))
6577 {
6578 Fput (coding_system, Qcoding_system_define_form, Qnil);
6579 safe_eval (define_form);
6580 }
6581 if (!NILP (Fcoding_system_p (coding_system)))
6582 return coding_system;
6583 xsignal1 (Qcoding_system_error, coding_system);
6584 }
6585 \f
6586 Lisp_Object
6587 detect_coding_system (src, src_bytes, highest, multibytep)
6588 const unsigned char *src;
6589 int src_bytes, highest;
6590 int multibytep;
6591 {
6592 int coding_mask, eol_type;
6593 Lisp_Object val, tmp;
6594 int dummy;
6595
6596 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6597 eol_type = detect_eol_type (src, src_bytes, &dummy);
6598 if (eol_type == CODING_EOL_INCONSISTENT)
6599 eol_type = CODING_EOL_UNDECIDED;
6600
6601 if (!coding_mask)
6602 {
6603 val = Qundecided;
6604 if (eol_type != CODING_EOL_UNDECIDED)
6605 {
6606 Lisp_Object val2;
6607 val2 = Fget (Qundecided, Qeol_type);
6608 if (VECTORP (val2))
6609 val = XVECTOR (val2)->contents[eol_type];
6610 }
6611 return (highest ? val : Fcons (val, Qnil));
6612 }
6613
6614 /* At first, gather possible coding systems in VAL. */
6615 val = Qnil;
6616 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6617 {
6618 Lisp_Object category_val, category_index;
6619
6620 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6621 category_val = Fsymbol_value (XCAR (tmp));
6622 if (!NILP (category_val)
6623 && NATNUMP (category_index)
6624 && (coding_mask & (1 << XFASTINT (category_index))))
6625 {
6626 val = Fcons (category_val, val);
6627 if (highest)
6628 break;
6629 }
6630 }
6631 if (!highest)
6632 val = Fnreverse (val);
6633
6634 /* Then, replace the elements with subsidiary coding systems. */
6635 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6636 {
6637 if (eol_type != CODING_EOL_UNDECIDED
6638 && eol_type != CODING_EOL_INCONSISTENT)
6639 {
6640 Lisp_Object eol;
6641 eol = Fget (XCAR (tmp), Qeol_type);
6642 if (VECTORP (eol))
6643 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6644 }
6645 }
6646 return (highest ? XCAR (val) : val);
6647 }
6648
6649 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6650 2, 3, 0,
6651 doc: /* Detect how the byte sequence in the region is encoded.
6652 Return a list of possible coding systems used on decoding a byte
6653 sequence containing the bytes in the region between START and END when
6654 the coding system `undecided' is specified. The list is ordered by
6655 priority decided in the current language environment.
6656
6657 If only ASCII characters are found (except for such ISO-2022 control
6658 characters ISO-2022 as ESC), it returns a list of single element
6659 `undecided' or its subsidiary coding system according to a detected
6660 end-of-line format.
6661
6662 If optional argument HIGHEST is non-nil, return the coding system of
6663 highest priority. */)
6664 (start, end, highest)
6665 Lisp_Object start, end, highest;
6666 {
6667 int from, to;
6668 int from_byte, to_byte;
6669 int include_anchor_byte = 0;
6670
6671 CHECK_NUMBER_COERCE_MARKER (start);
6672 CHECK_NUMBER_COERCE_MARKER (end);
6673
6674 validate_region (&start, &end);
6675 from = XINT (start), to = XINT (end);
6676 from_byte = CHAR_TO_BYTE (from);
6677 to_byte = CHAR_TO_BYTE (to);
6678
6679 if (from < GPT && to >= GPT)
6680 move_gap_both (to, to_byte);
6681 /* If we an anchor byte `\0' follows the region, we include it in
6682 the detecting source. Then code detectors can handle the tailing
6683 byte sequence more accurately.
6684
6685 Fix me: This is not a perfect solution. It is better that we
6686 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6687 */
6688 if (to == Z || (to == GPT && GAP_SIZE > 0))
6689 include_anchor_byte = 1;
6690 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6691 to_byte - from_byte + include_anchor_byte,
6692 !NILP (highest),
6693 !NILP (current_buffer
6694 ->enable_multibyte_characters));
6695 }
6696
6697 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6698 1, 2, 0,
6699 doc: /* Detect how the byte sequence in STRING is encoded.
6700 Return a list of possible coding systems used on decoding a byte
6701 sequence containing the bytes in STRING when the coding system
6702 `undecided' is specified. The list is ordered by priority decided in
6703 the current language environment.
6704
6705 If only ASCII characters are found (except for such ISO-2022 control
6706 characters ISO-2022 as ESC), it returns a list of single element
6707 `undecided' or its subsidiary coding system according to a detected
6708 end-of-line format.
6709
6710 If optional argument HIGHEST is non-nil, return the coding system of
6711 highest priority. */)
6712 (string, highest)
6713 Lisp_Object string, highest;
6714 {
6715 CHECK_STRING (string);
6716
6717 return detect_coding_system (SDATA (string),
6718 /* "+ 1" is to include the anchor byte
6719 `\0'. With this, code detectors can
6720 handle the tailing bytes more
6721 accurately. */
6722 SBYTES (string) + 1,
6723 !NILP (highest),
6724 STRING_MULTIBYTE (string));
6725 }
6726
6727 /* Subroutine for Ffind_coding_systems_region_internal.
6728
6729 Return a list of coding systems that safely encode the multibyte
6730 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6731 possible coding systems. If it is nil, it means that we have not
6732 yet found any coding systems.
6733
6734 WORK_TABLE is a char-table whose element is set to t once the
6735 element is looked up.
6736
6737 If a non-ASCII single byte char is found, set
6738 *single_byte_char_found to 1.

   Each entry of SAFE_CODINGS starts out as ( CODING . SAFE-CHARS ).
   The first time a character is not found in SAFE-CHARS, the entry
   is replaced in the list by a five-element list that also holds the
   tables consulted as fallbacks.  A coding system that cannot encode
   some character is unlinked from SAFE_CODINGS destructively, and the
   possibly shortened list is the return value. */
6739
6740 static Lisp_Object
6741 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6742 unsigned char *p, *pend;
6743 Lisp_Object safe_codings, work_table;
6744 int *single_byte_char_found;
6745 {
6746 int c, len;
6747 Lisp_Object val, ch;
/* PREV is the last cons of SAFE_CODINGS that was kept; TAIL is the
   cons being examined. */
6748 Lisp_Object prev, tail;
6749
6750 if (NILP (safe_codings))
6751 goto done_safe_codings;
6752 while (p < pend)
6753 {
6754 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6755 p += len;
6756 if (ASCII_BYTE_P (c))
6757 /* We can ignore ASCII characters here. */
6758 continue;
6759 if (SINGLE_BYTE_CHAR_P (c))
6760 *single_byte_char_found = 1;
6761 /* Check the safe coding systems for C. */
6762 ch = make_number (c);
6763 val = Faref (work_table, ch);
6764 if (EQ (val, Qt))
6765 /* This element was already checked. Ignore it. */
6766 continue;
6767 /* Remember that we checked this element. */
6768 Faset (work_table, ch, Qt);
6769
6770 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6771 {
/* The three tables below are assigned only on the paths where
   ENCODABLE is zero; the test further down reads them only in that
   case. */
6772 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6773 int encodable;
6774
6775 elt = XCAR (tail);
6776 if (CONSP (XCDR (elt)))
6777 {
6778 /* This entry has this format now:
6779 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6780 ACCEPT-LATIN-EXTRA ) */
6781 val = XCDR (elt);
6782 encodable = ! NILP (Faref (XCAR (val), ch));
6783 if (! encodable)
6784 {
6785 val = XCDR (val);
6786 translation_table = XCAR (val);
6787 hash_table = XCAR (XCDR (val));
6788 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6789 }
6790 }
6791 else
6792 {
6793 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6794 encodable = ! NILP (Faref (XCDR (elt), ch));
6795 if (! encodable)
6796 {
6797 /* Transform the format to:
6798 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6799 ACCEPT-LATIN-EXTRA ) */
6800 val = Fget (XCAR (elt), Qcoding_system);
6801 translation_table
6802 = Fplist_get (AREF (val, 3),
6803 Qtranslation_table_for_encode);
6804 if (SYMBOLP (translation_table))
6805 translation_table = Fget (translation_table,
6806 Qtranslation_table);
6807 hash_table
6808 = (CHAR_TABLE_P (translation_table)
6809 ? XCHAR_TABLE (translation_table)->extras[1]
6810 : Qnil);
6811 accept_latin_extra
6812 = ((EQ (AREF (val, 0), make_number (2))
6813 && VECTORP (AREF (val, 4)))
6814 ? AREF (AREF (val, 4), 16)
6815 : Qnil);
6816 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6817 translation_table, hash_table,
6818 accept_latin_extra));
6819 }
6820 }
6821
/* C is not in SAFE-CHARS.  It still counts as encodable if the
   translation table or the hash table has a non-nil entry for it,
   or if it is a single byte char with a non-nil entry in
   Vlatin_extra_code_table and ACCEPT-LATIN-EXTRA is non-nil. */
6822 if (! encodable
6823 && ((CHAR_TABLE_P (translation_table)
6824 && ! NILP (Faref (translation_table, ch)))
6825 || (HASH_TABLE_P (hash_table)
6826 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6827 || (SINGLE_BYTE_CHAR_P (c)
6828 && ! NILP (accept_latin_extra)
6829 && VECTORP (Vlatin_extra_code_table)
6830 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6831 encodable = 1;
6832 if (encodable)
6833 prev = tail;
6834 else
6835 {
6836 /* Exclude this coding system from SAFE_CODINGS. */
6837 if (EQ (tail, safe_codings))
6838 {
/* TAIL is the head of the list: drop it by advancing SAFE_CODINGS. */
6839 safe_codings = XCDR (safe_codings);
6840 if (NILP (safe_codings))
6841 goto done_safe_codings;
6842 }
6843 else
/* Otherwise splice TAIL out after the last cons that was kept. */
6844 XSETCDR (prev, XCDR (tail));
6845 }
6846 }
6847 }
6848
6849 done_safe_codings:
6850 /* If the above loop was terminated before P reaches PEND, it means
6851 SAFE_CODINGS was set to nil. If we have not yet found a
6852 non-ASCII single-byte char, check it now. */
6853 if (! *single_byte_char_found)
6854 while (p < pend)
6855 {
6856 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6857 p += len;
6858 if (! ASCII_BYTE_P (c)
6859 && SINGLE_BYTE_CHAR_P (c))
6860 {
6861 *single_byte_char_found = 1;
6862 break;
6863 }
6864 }
6865 return safe_codings;
6866 }
6867
6868 DEFUN ("find-coding-systems-region-internal",
6869 Ffind_coding_systems_region_internal,
6870 Sfind_coding_systems_region_internal, 2, 2, 0,
6871 doc: /* Internal use only. */)
6872 (start, end)
6873 Lisp_Object start, end;
6874 {
6875 Lisp_Object work_table, safe_codings;
6876 int non_ascii_p = 0;
6877 int single_byte_char_found = 0;
6878 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6879
6880 if (STRINGP (start))
6881 {
6882 if (!STRING_MULTIBYTE (start))
6883 return Qt;
6884 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6885 p2 = p2end = p1end;
6886 if (SCHARS (start) != SBYTES (start))
6887 non_ascii_p = 1;
6888 }
6889 else
6890 {
6891 int from, to, stop;
6892
6893 CHECK_NUMBER_COERCE_MARKER (start);
6894 CHECK_NUMBER_COERCE_MARKER (end);
6895 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6896 args_out_of_range (start, end);
6897 if (NILP (current_buffer->enable_multibyte_characters))
6898 return Qt;
6899 from = CHAR_TO_BYTE (XINT (start));
6900 to = CHAR_TO_BYTE (XINT (end));
6901 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6902 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6903 if (stop == to)
6904 p2 = p2end = p1end;
6905 else
6906 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6907 if (XINT (end) - XINT (start) != to - from)
6908 non_ascii_p = 1;
6909 }
6910
6911 if (!non_ascii_p)
6912 {
6913 /* We are sure that the text contains no multibyte character.
6914 Check if it contains eight-bit-graphic. */
6915 p = p1;
6916 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6917 if (p == p1end)
6918 {
6919 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6920 if (p == p2end)
6921 return Qt;
6922 }
6923 }
6924
6925 /* The text contains non-ASCII characters. */
6926
6927 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6928 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6929
6930 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6931 &single_byte_char_found);
6932 if (p2 < p2end)
6933 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6934 &single_byte_char_found);
6935 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6936 safe_codings = Qt;
6937 else
6938 {
6939 /* Turn safe_codings to a list of coding systems... */
6940 Lisp_Object val;
6941
6942 if (single_byte_char_found)
6943 /* ... and append these for eight-bit chars. */
6944 val = Fcons (Qraw_text,
6945 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6946 else
6947 /* ... and append generic coding systems. */
6948 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6949
6950 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6951 val = Fcons (XCAR (XCAR (safe_codings)), val);
6952 safe_codings = val;
6953 }
6954
6955 return safe_codings;
6956 }
6957
6958
6959 /* Search from position POS for such characters that are unencodable
6960 accoding to SAFE_CHARS, and return a list of their positions. P
6961 points where in the memory the character at POS exists. Limit the
6962 search at PEND or when Nth unencodable characters are found.
6963
6964 If SAFE_CHARS is a char table, an element for an unencodable
6965 character is nil.
6966
6967 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6968
6969 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6970 eight-bit-graphic characters are unencodable. */
6971
6972 static Lisp_Object
6973 unencodable_char_position (safe_chars, pos, p, pend, n)
6974 Lisp_Object safe_chars;
6975 int pos;
6976 unsigned char *p, *pend;
6977 int n;
6978 {
6979 Lisp_Object pos_list;
6980
6981 pos_list = Qnil;
6982 while (p < pend)
6983 {
6984 int len;
6985 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6986
6987 if (c >= 128
6988 && (CHAR_TABLE_P (safe_chars)
6989 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6990 : (NILP (safe_chars) || c < 256)))
6991 {
6992 pos_list = Fcons (make_number (pos), pos_list);
6993 if (--n <= 0)
6994 break;
6995 }
6996 pos++;
6997 p += len;
6998 }
6999 return Fnreverse (pos_list);
7000 }
7001
7002
7003 DEFUN ("unencodable-char-position", Funencodable_char_position,
7004 Sunencodable_char_position, 3, 5, 0,
7005 doc: /*
7006 Return position of first un-encodable character in a region.
7007 START and END specfiy the region and CODING-SYSTEM specifies the
7008 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7009
7010 If optional 4th argument COUNT is non-nil, it specifies at most how
7011 many un-encodable characters to search. In this case, the value is a
7012 list of positions.
7013
7014 If optional 5th argument STRING is non-nil, it is a string to search
7015 for un-encodable characters. In that case, START and END are indexes
7016 to the string. */)
7017 (start, end, coding_system, count, string)
7018 Lisp_Object start, end, coding_system, count, string;
7019 {
7020 int n;
7021 Lisp_Object safe_chars;
7022 struct coding_system coding;
7023 Lisp_Object positions;
7024 int from, to;
7025 unsigned char *p, *pend;
7026
7027 if (NILP (string))
7028 {
7029 validate_region (&start, &end);
7030 from = XINT (start);
7031 to = XINT (end);
7032 if (NILP (current_buffer->enable_multibyte_characters))
7033 return Qnil;
7034 p = CHAR_POS_ADDR (from);
7035 if (to == GPT)
7036 pend = GPT_ADDR;
7037 else
7038 pend = CHAR_POS_ADDR (to);
7039 }
7040 else
7041 {
7042 CHECK_STRING (string);
7043 CHECK_NATNUM (start);
7044 CHECK_NATNUM (end);
7045 from = XINT (start);
7046 to = XINT (end);
7047 if (from > to
7048 || to > SCHARS (string))
7049 args_out_of_range_3 (string, start, end);
7050 if (! STRING_MULTIBYTE (string))
7051 return Qnil;
7052 p = SDATA (string) + string_char_to_byte (string, from);
7053 pend = SDATA (string) + string_char_to_byte (string, to);
7054 }
7055
7056 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7057
7058 if (NILP (count))
7059 n = 1;
7060 else
7061 {
7062 CHECK_NATNUM (count);
7063 n = XINT (count);
7064 }
7065
7066 if (coding.type == coding_type_no_conversion
7067 || coding.type == coding_type_raw_text)
7068 return Qnil;
7069
7070 if (coding.type == coding_type_undecided)
7071 safe_chars = Qnil;
7072 else
7073 safe_chars = coding_safe_chars (coding_system);
7074
7075 if (STRINGP (string)
7076 || from >= GPT || to <= GPT)
7077 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7078 else
7079 {
7080 Lisp_Object args[2];
7081
7082 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7083 n -= XINT (Flength (args[0]));
7084 if (n <= 0)
7085 positions = args[0];
7086 else
7087 {
7088 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7089 pend, n);
7090 positions = Fappend (2, args);
7091 }
7092 }
7093
7094 return (NILP (count) ? Fcar (positions) : positions);
7095 }
7096
7097
7098 Lisp_Object
7099 code_convert_region1 (start, end, coding_system, encodep)
7100 Lisp_Object start, end, coding_system;
7101 int encodep;
7102 {
7103 struct coding_system coding;
7104 int from, to;
7105
7106 CHECK_NUMBER_COERCE_MARKER (start);
7107 CHECK_NUMBER_COERCE_MARKER (end);
7108 CHECK_SYMBOL (coding_system);
7109
7110 validate_region (&start, &end);
7111 from = XFASTINT (start);
7112 to = XFASTINT (end);
7113
7114 if (NILP (coding_system))
7115 return make_number (to - from);
7116
7117 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7118 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7119
7120 coding.mode |= CODING_MODE_LAST_BLOCK;
7121 coding.src_multibyte = coding.dst_multibyte
7122 = !NILP (current_buffer->enable_multibyte_characters);
7123 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7124 &coding, encodep, 1);
7125 Vlast_coding_system_used = coding.symbol;
7126 return make_number (coding.produced_char);
7127 }
7128
7129 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7130 3, 3, "r\nzCoding system: ",
7131 doc: /* Decode the current region from the specified coding system.
7132 When called from a program, takes three arguments:
7133 START, END, and CODING-SYSTEM. START and END are buffer positions.
7134 This function sets `last-coding-system-used' to the precise coding system
7135 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7136 not fully specified.)
7137 It returns the length of the decoded text. */)
7138 (start, end, coding_system)
7139 Lisp_Object start, end, coding_system;
7140 {
7141 return code_convert_region1 (start, end, coding_system, 0);
7142 }
7143
7144 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7145 3, 3, "r\nzCoding system: ",
7146 doc: /* Encode the current region into the specified coding system.
7147 When called from a program, takes three arguments:
7148 START, END, and CODING-SYSTEM. START and END are buffer positions.
7149 This function sets `last-coding-system-used' to the precise coding system
7150 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7151 not fully specified.)
7152 It returns the length of the encoded text. */)
7153 (start, end, coding_system)
7154 Lisp_Object start, end, coding_system;
7155 {
7156 return code_convert_region1 (start, end, coding_system, 1);
7157 }
7158
7159 Lisp_Object
7160 code_convert_string1 (string, coding_system, nocopy, encodep)
7161 Lisp_Object string, coding_system, nocopy;
7162 int encodep;
7163 {
7164 struct coding_system coding;
7165
7166 CHECK_STRING (string);
7167 CHECK_SYMBOL (coding_system);
7168
7169 if (NILP (coding_system))
7170 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7171
7172 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7173 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7174
7175 coding.mode |= CODING_MODE_LAST_BLOCK;
7176 string = (encodep
7177 ? encode_coding_string (string, &coding, !NILP (nocopy))
7178 : decode_coding_string (string, &coding, !NILP (nocopy)));
7179 Vlast_coding_system_used = coding.symbol;
7180
7181 return string;
7182 }
7183
7184 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7185 2, 3, 0,
7186 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7187 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7188 if the decoding operation is trivial.
7189 This function sets `last-coding-system-used' to the precise coding system
7190 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7191 not fully specified.) */)
7192 (string, coding_system, nocopy)
7193 Lisp_Object string, coding_system, nocopy;
7194 {
7195 return code_convert_string1 (string, coding_system, nocopy, 0);
7196 }
7197
7198 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7199 2, 3, 0,
7200 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7201 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7202 if the encoding operation is trivial.
7203 This function sets `last-coding-system-used' to the precise coding system
7204 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7205 not fully specified.) */)
7206 (string, coding_system, nocopy)
7207 Lisp_Object string, coding_system, nocopy;
7208 {
7209 return code_convert_string1 (string, coding_system, nocopy, 1);
7210 }
7211
7212 /* Encode or decode STRING according to CODING_SYSTEM.
7213 Do not set Vlast_coding_system_used.
7214
7215 This function is called only from macros DECODE_FILE and
7216 ENCODE_FILE, thus we ignore character composition. */
7217
7218 Lisp_Object
7219 code_convert_string_norecord (string, coding_system, encodep)
7220 Lisp_Object string, coding_system;
7221 int encodep;
7222 {
7223 struct coding_system coding;
7224
7225 CHECK_STRING (string);
7226 CHECK_SYMBOL (coding_system);
7227
7228 if (NILP (coding_system))
7229 return string;
7230
7231 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7232 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7233
7234 coding.composing = COMPOSITION_DISABLED;
7235 coding.mode |= CODING_MODE_LAST_BLOCK;
7236 return (encodep
7237 ? encode_coding_string (string, &coding, 1)
7238 : decode_coding_string (string, &coding, 1));
7239 }
7240 \f
7241 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7242 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7243 Return the corresponding character. */)
7244 (code)
7245 Lisp_Object code;
7246 {
7247 unsigned char c1, c2, s1, s2;
7248 Lisp_Object val;
7249
7250 CHECK_NUMBER (code);
7251 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7252 if (s1 == 0)
7253 {
7254 if (s2 < 0x80)
7255 XSETFASTINT (val, s2);
7256 else if (s2 >= 0xA0 || s2 <= 0xDF)
7257 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7258 else
7259 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7260 }
7261 else
7262 {
7263 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7264 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7265 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7266 DECODE_SJIS (s1, s2, c1, c2);
7267 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7268 }
7269 return val;
7270 }
7271
7272 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7273 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7274 Return the corresponding code in SJIS. */)
7275 (ch)
7276 Lisp_Object ch;
7277 {
7278 int charset, c1, c2, s1, s2;
7279 Lisp_Object val;
7280
7281 CHECK_NUMBER (ch);
7282 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7283 if (charset == CHARSET_ASCII)
7284 {
7285 val = ch;
7286 }
7287 else if (charset == charset_jisx0208
7288 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7289 {
7290 ENCODE_SJIS (c1, c2, s1, s2);
7291 XSETFASTINT (val, (s1 << 8) | s2);
7292 }
7293 else if (charset == charset_katakana_jisx0201
7294 && c1 > 0x20 && c2 < 0xE0)
7295 {
7296 XSETFASTINT (val, c1 | 0x80);
7297 }
7298 else
7299 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7300 return val;
7301 }
7302
7303 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7304 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7305 Return the corresponding character. */)
7306 (code)
7307 Lisp_Object code;
7308 {
7309 int charset;
7310 unsigned char b1, b2, c1, c2;
7311 Lisp_Object val;
7312
7313 CHECK_NUMBER (code);
7314 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7315 if (b1 == 0)
7316 {
7317 if (b2 >= 0x80)
7318 error ("Invalid BIG5 code: %x", XFASTINT (code));
7319 val = code;
7320 }
7321 else
7322 {
7323 if ((b1 < 0xA1 || b1 > 0xFE)
7324 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7325 error ("Invalid BIG5 code: %x", XFASTINT (code));
7326 DECODE_BIG5 (b1, b2, charset, c1, c2);
7327 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7328 }
7329 return val;
7330 }
7331
7332 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7333 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7334 Return the corresponding character code in Big5. */)
7335 (ch)
7336 Lisp_Object ch;
7337 {
7338 int charset, c1, c2, b1, b2;
7339 Lisp_Object val;
7340
7341 CHECK_NUMBER (ch);
7342 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7343 if (charset == CHARSET_ASCII)
7344 {
7345 val = ch;
7346 }
7347 else if ((charset == charset_big5_1
7348 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7349 || (charset == charset_big5_2
7350 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7351 {
7352 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7353 XSETFASTINT (val, (b1 << 8) | b2);
7354 }
7355 else
7356 error ("Can't encode to Big5: %d", XFASTINT (ch));
7357 return val;
7358 }
7359 \f
7360 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7361 Sset_terminal_coding_system_internal, 1, 2, 0,
7362 doc: /* Internal use only. */)
7363 (coding_system, terminal)
7364 Lisp_Object coding_system;
7365 Lisp_Object terminal;
7366 {
7367 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
7368 CHECK_SYMBOL (coding_system);
7369 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7370 /* We had better not send unsafe characters to terminal. */
7371 terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7372 /* Character composition should be disabled. */
7373 terminal_coding->composing = COMPOSITION_DISABLED;
7374 /* Error notification should be suppressed. */
7375 terminal_coding->suppress_error = 1;
7376 terminal_coding->src_multibyte = 1;
7377 terminal_coding->dst_multibyte = 0;
7378 return Qnil;
7379 }
7380
7381 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7382 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7383 doc: /* Internal use only. */)
7384 (coding_system)
7385 Lisp_Object coding_system;
7386 {
7387 CHECK_SYMBOL (coding_system);
7388 setup_coding_system (Fcheck_coding_system (coding_system),
7389 &safe_terminal_coding);
7390 /* Character composition should be disabled. */
7391 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7392 /* Error notification should be suppressed. */
7393 safe_terminal_coding.suppress_error = 1;
7394 safe_terminal_coding.src_multibyte = 1;
7395 safe_terminal_coding.dst_multibyte = 0;
7396 return Qnil;
7397 }
7398
7399 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7400 Sterminal_coding_system, 0, 1, 0,
7401 doc: /* Return coding system specified for terminal output on the given terminal.
7402 TERMINAL may be a terminal id, a frame, or nil for the selected
7403 frame's terminal device. */)
7404 (terminal)
7405 Lisp_Object terminal;
7406 {
7407 return TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1))->symbol;
7408 }
7409
7410 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7411 Sset_keyboard_coding_system_internal, 1, 2, 0,
7412 doc: /* Internal use only. */)
7413 (coding_system, terminal)
7414 Lisp_Object coding_system;
7415 Lisp_Object terminal;
7416 {
7417 struct terminal *t = get_terminal (terminal, 1);
7418 CHECK_SYMBOL (coding_system);
7419
7420 setup_coding_system (Fcheck_coding_system (coding_system),
7421 TERMINAL_KEYBOARD_CODING (t));
7422 /* Character composition should be disabled. */
7423 TERMINAL_KEYBOARD_CODING (t)->composing = COMPOSITION_DISABLED;
7424 return Qnil;
7425 }
7426
7427 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7428 Skeyboard_coding_system, 0, 1, 0,
7429 doc: /* Return coding system for decoding keyboard input on TERMINAL.
7430 TERMINAL may be a terminal id, a frame, or nil for the selected
7431 frame's terminal device. */)
7432 (terminal)
7433 Lisp_Object terminal;
7434 {
7435 return TERMINAL_KEYBOARD_CODING (get_terminal (terminal, 1))->symbol;
7436 }
7437
7438 \f
7439 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7440 Sfind_operation_coding_system, 1, MANY, 0,
7441 doc: /* Choose a coding system for an operation based on the target name.
7442 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7443 DECODING-SYSTEM is the coding system to use for decoding
7444 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7445 for encoding (in case OPERATION does encoding).
7446
7447 The first argument OPERATION specifies an I/O primitive:
7448 For file I/O, `insert-file-contents' or `write-region'.
7449 For process I/O, `call-process', `call-process-region', or `start-process'.
7450 For network I/O, `open-network-stream'.
7451
7452 The remaining arguments should be the same arguments that were passed
7453 to the primitive. Depending on which primitive, one of those arguments
7454 is selected as the TARGET. For example, if OPERATION does file I/O,
7455 whichever argument specifies the file name is TARGET.
7456
7457 TARGET has a meaning which depends on OPERATION:
7458 For file I/O, TARGET is a file name (except for the special case below).
7459 For process I/O, TARGET is a process name.
7460 For network I/O, TARGET is a service name or a port number
7461
7462 This function looks up what specified for TARGET in,
7463 `file-coding-system-alist', `process-coding-system-alist',
7464 or `network-coding-system-alist' depending on OPERATION.
7465 They may specify a coding system, a cons of coding systems,
7466 or a function symbol to call.
7467 In the last case, we call the function with one argument,
7468 which is a list of all the arguments given to this function.
7469
7470 If OPERATION is `insert-file-contents', the argument corresponding to
7471 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
7472 file name to look up, and BUFFER is a buffer that contains the file's
7473 contents (not yet decoded). If `file-coding-system-alist' specifies a
7474 function to call for FILENAME, that function should examine the
7475 contents of BUFFER instead of reading the file.
7476
7477 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7478 (nargs, args)
7479 int nargs;
7480 Lisp_Object *args;
7481 {
7482 Lisp_Object operation, target_idx, target, val;
7483 register Lisp_Object chain;
7484
7485 if (nargs < 2)
7486 error ("Too few arguments");
7487 operation = args[0];
7488 if (!SYMBOLP (operation)
7489 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7490 error ("Invalid first argument");
7491 if (nargs < 1 + XINT (target_idx))
7492 error ("Too few arguments for operation: %s",
7493 SDATA (SYMBOL_NAME (operation)));
7494 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7495 argument to write-region) is string, it must be treated as a
7496 target file name. */
7497 if (EQ (operation, Qwrite_region)
7498 && nargs > 5
7499 && STRINGP (args[5]))
7500 target_idx = make_number (4);
7501 target = args[XINT (target_idx) + 1];
7502 if (!(STRINGP (target)
7503 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7504 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7505 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7506 error ("Invalid argument %d", XINT (target_idx) + 1);
7507 if (CONSP (target))
7508 target = XCAR (target);
7509
7510 chain = ((EQ (operation, Qinsert_file_contents)
7511 || EQ (operation, Qwrite_region))
7512 ? Vfile_coding_system_alist
7513 : (EQ (operation, Qopen_network_stream)
7514 ? Vnetwork_coding_system_alist
7515 : Vprocess_coding_system_alist));
7516 if (NILP (chain))
7517 return Qnil;
7518
7519 for (; CONSP (chain); chain = XCDR (chain))
7520 {
7521 Lisp_Object elt;
7522 elt = XCAR (chain);
7523
7524 if (CONSP (elt)
7525 && ((STRINGP (target)
7526 && STRINGP (XCAR (elt))
7527 && fast_string_match (XCAR (elt), target) >= 0)
7528 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7529 {
7530 val = XCDR (elt);
7531 /* Here, if VAL is both a valid coding system and a valid
7532 function symbol, we return VAL as a coding system. */
7533 if (CONSP (val))
7534 return val;
7535 if (! SYMBOLP (val))
7536 return Qnil;
7537 if (! NILP (Fcoding_system_p (val)))
7538 return Fcons (val, val);
7539 if (! NILP (Ffboundp (val)))
7540 {
7541 /* We use call1 rather than safe_call1
7542 so as to get bug reports about functions called here
7543 which don't handle the current interface. */
7544 val = call1 (val, Flist (nargs, args));
7545 if (CONSP (val))
7546 return val;
7547 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7548 return Fcons (val, val);
7549 }
7550 return Qnil;
7551 }
7552 }
7553 return Qnil;
7554 }
7555
7556 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7557 Supdate_coding_systems_internal, 0, 0, 0,
7558 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7559 When values of any coding categories are changed, you must
7560 call this function. */)
7561 ()
7562 {
7563 int i;
7564
7565 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7566 {
7567 Lisp_Object val;
7568
7569 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7570 if (!NILP (val))
7571 {
7572 if (! coding_system_table[i])
7573 coding_system_table[i] = ((struct coding_system *)
7574 xmalloc (sizeof (struct coding_system)));
7575 setup_coding_system (val, coding_system_table[i]);
7576 }
7577 else if (coding_system_table[i])
7578 {
7579 xfree (coding_system_table[i]);
7580 coding_system_table[i] = NULL;
7581 }
7582 }
7583
7584 return Qnil;
7585 }
7586
7587 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7588 Sset_coding_priority_internal, 0, 0, 0,
7589 doc: /* Update internal database for the current value of `coding-category-list'.
7590 This function is internal use only. */)
7591 ()
7592 {
7593 int i = 0, idx;
7594 Lisp_Object val;
7595
7596 val = Vcoding_category_list;
7597
7598 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7599 {
7600 if (! SYMBOLP (XCAR (val)))
7601 break;
7602 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7603 if (idx >= CODING_CATEGORY_IDX_MAX)
7604 break;
7605 coding_priorities[i++] = (1 << idx);
7606 val = XCDR (val);
7607 }
7608 /* If coding-category-list is valid and contains all coding
7609 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7610 the following code saves Emacs from crashing. */
7611 while (i < CODING_CATEGORY_IDX_MAX)
7612 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7613
7614 return Qnil;
7615 }
7616
7617 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7618 Sdefine_coding_system_internal, 1, 1, 0,
7619 doc: /* Register CODING-SYSTEM as a base coding system.
7620 This function is internal use only. */)
7621 (coding_system)
7622 Lisp_Object coding_system;
7623 {
7624 Lisp_Object safe_chars, slot;
7625
7626 if (NILP (Fcheck_coding_system (coding_system)))
7627 xsignal1 (Qcoding_system_error, coding_system);
7628
7629 safe_chars = coding_safe_chars (coding_system);
7630 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7631 error ("No valid safe-chars property for %s",
7632 SDATA (SYMBOL_NAME (coding_system)));
7633
7634 if (EQ (safe_chars, Qt))
7635 {
7636 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7637 XSETCAR (Vcoding_system_safe_chars,
7638 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7639 }
7640 else
7641 {
7642 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7643 if (NILP (slot))
7644 XSETCDR (Vcoding_system_safe_chars,
7645 nconc2 (XCDR (Vcoding_system_safe_chars),
7646 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7647 else
7648 XSETCDR (slot, safe_chars);
7649 }
7650 return Qnil;
7651 }
7652
7653 #endif /* emacs */
7654
7655 \f
7656 /*** 9. Post-amble ***/
7657
7658 void
7659 init_coding_once ()
7660 {
7661 int i;
7662
7663 /* Emacs' internal format specific initialize routine. */
7664 for (i = 0; i <= 0x20; i++)
7665 emacs_code_class[i] = EMACS_control_code;
7666 emacs_code_class[0x0A] = EMACS_linefeed_code;
7667 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7668 for (i = 0x21 ; i < 0x7F; i++)
7669 emacs_code_class[i] = EMACS_ascii_code;
7670 emacs_code_class[0x7F] = EMACS_control_code;
7671 for (i = 0x80; i < 0xFF; i++)
7672 emacs_code_class[i] = EMACS_invalid_code;
7673 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7674 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7675 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7676 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7677
7678 /* ISO2022 specific initialize routine. */
7679 for (i = 0; i < 0x20; i++)
7680 iso_code_class[i] = ISO_control_0;
7681 for (i = 0x21; i < 0x7F; i++)
7682 iso_code_class[i] = ISO_graphic_plane_0;
7683 for (i = 0x80; i < 0xA0; i++)
7684 iso_code_class[i] = ISO_control_1;
7685 for (i = 0xA1; i < 0xFF; i++)
7686 iso_code_class[i] = ISO_graphic_plane_1;
7687 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7688 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7689 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7690 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7691 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7692 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7693 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7694 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7695 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7696 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7697
7698 setup_coding_system (Qnil, &safe_terminal_coding);
7699 setup_coding_system (Qnil, &default_buffer_file_coding);
7700
7701 bzero (coding_system_table, sizeof coding_system_table);
7702
7703 bzero (ascii_skip_code, sizeof ascii_skip_code);
7704 for (i = 0; i < 128; i++)
7705 ascii_skip_code[i] = 1;
7706
7707 #if defined (MSDOS) || defined (WINDOWSNT)
7708 system_eol_type = CODING_EOL_CRLF;
7709 #else
7710 system_eol_type = CODING_EOL_LF;
7711 #endif
7712
7713 inhibit_pre_post_conversion = 0;
7714 }
7715
7716 #ifdef emacs
7717
7718 void
7719 syms_of_coding ()
7720 {
7721 staticpro (&Vcode_conversion_workbuf_name);
7722 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7723
7724 Qtarget_idx = intern ("target-idx");
7725 staticpro (&Qtarget_idx);
7726
7727 Qcoding_system_history = intern ("coding-system-history");
7728 staticpro (&Qcoding_system_history);
7729 Fset (Qcoding_system_history, Qnil);
7730
7731 /* Target FILENAME is the first argument. */
7732 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7733 /* Target FILENAME is the third argument. */
7734 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7735
7736 Qcall_process = intern ("call-process");
7737 staticpro (&Qcall_process);
7738 /* Target PROGRAM is the first argument. */
7739 Fput (Qcall_process, Qtarget_idx, make_number (0));
7740
7741 Qcall_process_region = intern ("call-process-region");
7742 staticpro (&Qcall_process_region);
7743 /* Target PROGRAM is the third argument. */
7744 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7745
7746 Qstart_process = intern ("start-process");
7747 staticpro (&Qstart_process);
7748 /* Target PROGRAM is the third argument. */
7749 Fput (Qstart_process, Qtarget_idx, make_number (2));
7750
7751 Qopen_network_stream = intern ("open-network-stream");
7752 staticpro (&Qopen_network_stream);
7753 /* Target SERVICE is the fourth argument. */
7754 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7755
7756 Qcoding_system = intern ("coding-system");
7757 staticpro (&Qcoding_system);
7758
7759 Qeol_type = intern ("eol-type");
7760 staticpro (&Qeol_type);
7761
7762 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7763 staticpro (&Qbuffer_file_coding_system);
7764
7765 Qpost_read_conversion = intern ("post-read-conversion");
7766 staticpro (&Qpost_read_conversion);
7767
7768 Qpre_write_conversion = intern ("pre-write-conversion");
7769 staticpro (&Qpre_write_conversion);
7770
7771 Qno_conversion = intern ("no-conversion");
7772 staticpro (&Qno_conversion);
7773
7774 Qundecided = intern ("undecided");
7775 staticpro (&Qundecided);
7776
7777 Qcoding_system_p = intern ("coding-system-p");
7778 staticpro (&Qcoding_system_p);
7779
7780 Qcoding_system_error = intern ("coding-system-error");
7781 staticpro (&Qcoding_system_error);
7782
7783 Fput (Qcoding_system_error, Qerror_conditions,
7784 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7785 Fput (Qcoding_system_error, Qerror_message,
7786 build_string ("Invalid coding system"));
7787
7788 Qcoding_category = intern ("coding-category");
7789 staticpro (&Qcoding_category);
7790 Qcoding_category_index = intern ("coding-category-index");
7791 staticpro (&Qcoding_category_index);
7792
7793 Vcoding_category_table
7794 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7795 staticpro (&Vcoding_category_table);
7796 {
7797 int i;
7798 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7799 {
7800 XVECTOR (Vcoding_category_table)->contents[i]
7801 = intern (coding_category_name[i]);
7802 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7803 Qcoding_category_index, make_number (i));
7804 }
7805 }
7806
7807 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7808 staticpro (&Vcoding_system_safe_chars);
7809
7810 Qtranslation_table = intern ("translation-table");
7811 staticpro (&Qtranslation_table);
7812 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7813
7814 Qtranslation_table_id = intern ("translation-table-id");
7815 staticpro (&Qtranslation_table_id);
7816
7817 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7818 staticpro (&Qtranslation_table_for_decode);
7819
7820 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7821 staticpro (&Qtranslation_table_for_encode);
7822
7823 Qsafe_chars = intern ("safe-chars");
7824 staticpro (&Qsafe_chars);
7825
7826 Qchar_coding_system = intern ("char-coding-system");
7827 staticpro (&Qchar_coding_system);
7828
7829 /* Intern this now in case it isn't already done.
7830 Setting this variable twice is harmless.
7831 But don't staticpro it here--that is done in alloc.c. */
7832 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7833 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7834 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7835
7836 Qvalid_codes = intern ("valid-codes");
7837 staticpro (&Qvalid_codes);
7838
7839 Qascii_incompatible = intern ("ascii-incompatible");
7840 staticpro (&Qascii_incompatible);
7841
7842 Qemacs_mule = intern ("emacs-mule");
7843 staticpro (&Qemacs_mule);
7844
7845 Qraw_text = intern ("raw-text");
7846 staticpro (&Qraw_text);
7847
7848 Qutf_8 = intern ("utf-8");
7849 staticpro (&Qutf_8);
7850
7851 Qcoding_system_define_form = intern ("coding-system-define-form");
7852 staticpro (&Qcoding_system_define_form);
7853
7854 defsubr (&Scoding_system_p);
7855 defsubr (&Sread_coding_system);
7856 defsubr (&Sread_non_nil_coding_system);
7857 defsubr (&Scheck_coding_system);
7858 defsubr (&Sdetect_coding_region);
7859 defsubr (&Sdetect_coding_string);
7860 defsubr (&Sfind_coding_systems_region_internal);
7861 defsubr (&Sunencodable_char_position);
7862 defsubr (&Sdecode_coding_region);
7863 defsubr (&Sencode_coding_region);
7864 defsubr (&Sdecode_coding_string);
7865 defsubr (&Sencode_coding_string);
7866 defsubr (&Sdecode_sjis_char);
7867 defsubr (&Sencode_sjis_char);
7868 defsubr (&Sdecode_big5_char);
7869 defsubr (&Sencode_big5_char);
7870 defsubr (&Sset_terminal_coding_system_internal);
7871 defsubr (&Sset_safe_terminal_coding_system_internal);
7872 defsubr (&Sterminal_coding_system);
7873 defsubr (&Sset_keyboard_coding_system_internal);
7874 defsubr (&Skeyboard_coding_system);
7875 defsubr (&Sfind_operation_coding_system);
7876 defsubr (&Supdate_coding_systems_internal);
7877 defsubr (&Sset_coding_priority_internal);
7878 defsubr (&Sdefine_coding_system_internal);
7879
7880 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7881 doc: /* List of coding systems.
7882
7883 Do not alter the value of this variable manually. This variable should be
7884 updated by the functions `make-coding-system' and
7885 `define-coding-system-alias'. */);
7886 Vcoding_system_list = Qnil;
7887
7888 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7889 doc: /* Alist of coding system names.
7890 Each element is one element list of coding system name.
7891 This variable is given to `completing-read' as TABLE argument.
7892
7893 Do not alter the value of this variable manually. This variable should be
7894 updated by the functions `make-coding-system' and
7895 `define-coding-system-alias'. */);
7896 Vcoding_system_alist = Qnil;
7897
7898 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7899 doc: /* List of coding-categories (symbols) ordered by priority.
7900
7901 On detecting a coding system, Emacs tries code detection algorithms
7902 associated with each coding-category one by one in this order. When
7903 one algorithm agrees with a byte sequence of source text, the coding
7904 system bound to the corresponding coding-category is selected.
7905
7906 Don't modify this variable directly, but use `set-coding-priority'. */);
7907 {
7908 int i;
7909
7910 Vcoding_category_list = Qnil;
7911 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7912 Vcoding_category_list
7913 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7914 Vcoding_category_list);
7915 }
7916
7917 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7918 doc: /* Specify the coding system for read operations.
7919 It is useful to bind this variable with `let', but do not set it globally.
7920 If the value is a coding system, it is used for decoding on read operation.
7921 If not, an appropriate element is used from one of the coding system alists:
7922 There are three such tables, `file-coding-system-alist',
7923 `process-coding-system-alist', and `network-coding-system-alist'. */);
7924 Vcoding_system_for_read = Qnil;
7925
7926 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7927 doc: /* Specify the coding system for write operations.
7928 Programs bind this variable with `let', but you should not set it globally.
7929 If the value is a coding system, it is used for encoding of output,
7930 when writing it to a file and when sending it to a file or subprocess.
7931
7932 If this does not specify a coding system, an appropriate element
7933 is used from one of the coding system alists:
7934 There are three such tables, `file-coding-system-alist',
7935 `process-coding-system-alist', and `network-coding-system-alist'.
7936 For output to files, if the above procedure does not specify a coding system,
7937 the value of `buffer-file-coding-system' is used. */);
7938 Vcoding_system_for_write = Qnil;
7939
7940 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7941 doc: /* Coding system used in the latest file or process I/O.
7942 Also set by `encode-coding-region', `decode-coding-region',
7943 `encode-coding-string' and `decode-coding-string'. */);
7944 Vlast_coding_system_used = Qnil;
7945
7946 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7947 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7948 See info node `Coding Systems' and info node `Text and Binary' concerning
7949 such conversion. */);
7950 inhibit_eol_conversion = 0;
7951
7952 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7953 doc: /* Non-nil means process buffer inherits coding system of process output.
7954 Bind it to t if the process output is to be treated as if it were a file
7955 read from some filesystem. */);
7956 inherit_process_coding_system = 0;
7957
7958 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7959 doc: /* Alist to decide a coding system to use for a file I/O operation.
7960 The format is ((PATTERN . VAL) ...),
7961 where PATTERN is a regular expression matching a file name,
7962 VAL is a coding system, a cons of coding systems, or a function symbol.
7963 If VAL is a coding system, it is used for both decoding and encoding
7964 the file contents.
7965 If VAL is a cons of coding systems, the car part is used for decoding,
7966 and the cdr part is used for encoding.
7967 If VAL is a function symbol, the function must return a coding system
7968 or a cons of coding systems which are used as above. The function is
7969 called with an argument that is a list of the arguments with which
7970 `find-operation-coding-system' was called.
7971
7972 See also the function `find-operation-coding-system'
7973 and the variable `auto-coding-alist'. */);
7974 Vfile_coding_system_alist = Qnil;
7975
7976 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7977 doc: /* Alist to decide a coding system to use for a process I/O operation.
7978 The format is ((PATTERN . VAL) ...),
7979 where PATTERN is a regular expression matching a program name,
7980 VAL is a coding system, a cons of coding systems, or a function symbol.
7981 If VAL is a coding system, it is used for both decoding what received
7982 from the program and encoding what sent to the program.
7983 If VAL is a cons of coding systems, the car part is used for decoding,
7984 and the cdr part is used for encoding.
7985 If VAL is a function symbol, the function must return a coding system
7986 or a cons of coding systems which are used as above.
7987
7988 See also the function `find-operation-coding-system'. */);
7989 Vprocess_coding_system_alist = Qnil;
7990
7991 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7992 doc: /* Alist to decide a coding system to use for a network I/O operation.
7993 The format is ((PATTERN . VAL) ...),
7994 where PATTERN is a regular expression matching a network service name
7995 or is a port number to connect to,
7996 VAL is a coding system, a cons of coding systems, or a function symbol.
7997 If VAL is a coding system, it is used for both decoding what received
7998 from the network stream and encoding what sent to the network stream.
7999 If VAL is a cons of coding systems, the car part is used for decoding,
8000 and the cdr part is used for encoding.
8001 If VAL is a function symbol, the function must return a coding system
8002 or a cons of coding systems which are used as above.
8003
8004 See also the function `find-operation-coding-system'. */);
8005 Vnetwork_coding_system_alist = Qnil;
8006
8007 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8008 doc: /* Coding system to use with system messages.
8009 Also used for decoding keyboard input on X Window system. */);
8010 Vlocale_coding_system = Qnil;
8011
8012 /* The eol mnemonics are reset in startup.el system-dependently. */
8013 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8014 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8015 eol_mnemonic_unix = build_string (":");
8016
8017 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8018 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8019 eol_mnemonic_dos = build_string ("\\");
8020
8021 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8022 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8023 eol_mnemonic_mac = build_string ("/");
8024
8025 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8026 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
8027 eol_mnemonic_undecided = build_string (":");
8028
8029 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8030 doc: /* *Non-nil enables character translation while encoding and decoding. */);
8031 Venable_character_translation = Qt;
8032
8033 DEFVAR_LISP ("standard-translation-table-for-decode",
8034 &Vstandard_translation_table_for_decode,
8035 doc: /* Table for translating characters while decoding. */);
8036 Vstandard_translation_table_for_decode = Qnil;
8037
8038 DEFVAR_LISP ("standard-translation-table-for-encode",
8039 &Vstandard_translation_table_for_encode,
8040 doc: /* Table for translating characters while encoding. */);
8041 Vstandard_translation_table_for_encode = Qnil;
8042
8043 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8044 doc: /* Alist of charsets vs revision numbers.
8045 While encoding, if a charset (car part of an element) is found,
8046 designate it with the escape sequence identifying revision (cdr part of the element). */);
8047 Vcharset_revision_alist = Qnil;
8048
8049 DEFVAR_LISP ("default-process-coding-system",
8050 &Vdefault_process_coding_system,
8051 doc: /* Cons of coding systems used for process I/O by default.
8052 The car part is used for decoding a process output,
8053 the cdr part is used for encoding a text to be sent to a process. */);
8054 Vdefault_process_coding_system = Qnil;
8055
8056 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8057 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8058 This is a vector of length 256.
8059 If Nth element is non-nil, the existence of code N in a file
8060 \(or output of subprocess) doesn't prevent it to be detected as
8061 a coding system of ISO 2022 variant which has a flag
8062 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8063 or reading output of a subprocess.
8064 Only 128th through 159th elements has a meaning. */);
8065 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8066
8067 DEFVAR_LISP ("select-safe-coding-system-function",
8068 &Vselect_safe_coding_system_function,
8069 doc: /* Function to call to select safe coding system for encoding a text.
8070
8071 If set, this function is called to force a user to select a proper
8072 coding system which can encode the text in the case that a default
8073 coding system used in each operation can't encode the text.
8074
8075 The default value is `select-safe-coding-system' (which see). */);
8076 Vselect_safe_coding_system_function = Qnil;
8077
8078 DEFVAR_BOOL ("coding-system-require-warning",
8079 &coding_system_require_warning,
8080 doc: /* Internal use only.
8081 If non-nil, on writing a file, `select-safe-coding-system-function' is
8082 called even if `coding-system-for-write' is non-nil. The command
8083 `universal-coding-system-argument' binds this variable to t temporarily. */);
8084 coding_system_require_warning = 0;
8085
8086
8087 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8088 &inhibit_iso_escape_detection,
8089 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8090
8091 By default, on reading a file, Emacs tries to detect how the text is
8092 encoded. This code detection is sensitive to escape sequences. If
8093 the sequence is valid as ISO2022, the code is determined as one of
8094 the ISO2022 encodings, and the file is decoded by the corresponding
8095 coding system (e.g. `iso-2022-7bit').
8096
8097 However, there may be a case that you want to read escape sequences in
8098 a file as is. In such a case, you can set this variable to non-nil.
8099 Then, as the code detection ignores any escape sequences, no file is
8100 detected as encoded in some ISO2022 encoding. The result is that all
8101 escape sequences become visible in a buffer.
8102
8103 The default value is nil, and it is strongly recommended not to change
8104 it. That is because many Emacs Lisp source files that contain
8105 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8106 in Emacs's distribution, and they won't be decoded correctly on
8107 reading if you suppress escape sequence detection.
8108
8109 The other way to read escape sequences in a file without decoding is
8110 to explicitly specify some coding system that doesn't use ISO2022's
8111 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8112 inhibit_iso_escape_detection = 0;
8113
8114 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8115 doc: /* Char table for translating self-inserting characters.
8116 This is applied to the result of input methods, not their input. See also
8117 `keyboard-translate-table'. */);
8118 Vtranslation_table_for_input = Qnil;
8119 }
8120
8121 char *
8122 emacs_strerror (error_number)
8123 int error_number;
8124 {
8125 char *str;
8126
8127 synchronize_system_messages_locale ();
8128 str = strerror (error_number);
8129
8130 if (! NILP (Vlocale_coding_system))
8131 {
8132 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8133 Vlocale_coding_system,
8134 0);
8135 str = (char *) SDATA (dec);
8136 }
8137
8138 return str;
8139 }
8140
8141 #endif /* emacs */
8142
8143 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8144 (do not change this comment) */