]> code.delx.au - gnu-emacs/blob - src/coding.c
Merge from emacs--rel--22
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 3, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24 Boston, MA 02110-1301, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-mule) handlers
31 3. ISO2022 handlers
32 4. Shift-JIS and BIG5 handlers
33 5. CCL handlers
34 6. End-of-line handlers
35 7. C library functions
36 8. Emacs Lisp library functions
37 9. Post-amble
38
39 */
40
41 /*** 0. General comments ***/
42
43
44 /*** GENERAL NOTE on CODING SYSTEMS ***
45
46 A coding system is an encoding mechanism for one or more character
47 sets. Here's a list of coding systems which Emacs can handle. When
48 we say "decode", it means converting some other coding system to
49 Emacs' internal format (emacs-mule), and when we say "encode",
50 it means converting the coding system emacs-mule to some other
51 coding system.
52
53 0. Emacs' internal format (emacs-mule)
54
55 Emacs itself holds a multi-lingual character in buffers and strings
56 in a special format. Details are described in section 2.
57
58 1. ISO2022
59
60 The most famous coding system for multiple character sets. X's
61 Compound Text, various EUCs (Extended Unix Code), and coding
62 systems used in Internet communication such as ISO-2022-JP are
63 all variants of ISO2022. Details are described in section 3.
64
65 2. SJIS (or Shift-JIS or MS-Kanji-Code)
66
67 A coding system to encode character sets: ASCII, JISX0201, and
68 JISX0208. Widely used for PC's in Japan. Details are described in
69 section 4.
70
71 3. BIG5
72
73 A coding system to encode the character sets ASCII and Big5. Widely
74 used for Chinese (mainly in Taiwan and Hong Kong). Details are
75 described in section 4. In this file, when we write "BIG5"
76 (all uppercase), we mean the coding system, and when we write
77 "Big5" (capitalized), we mean the character set.
78
79 4. Raw text
80
81 A coding system for text containing random 8-bit code. Emacs does
82 no code conversion on such text except for end-of-line format.
83
84 5. Other
85
86 If a user wants to read/write text encoded in a coding system not
87 listed above, he can supply a decoder and an encoder for it as CCL
88 (Code Conversion Language) programs. Emacs executes the CCL program
89 while reading/writing.
90
91 Emacs represents a coding system by a Lisp symbol that has a property
92 `coding-system'. But, before actually using the coding system, the
93 information about it is set in a structure of type `struct
94 coding_system' for rapid processing. See section 6 for more details.
95
96 */
97
98 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
99
100 How end-of-line of text is encoded depends on the operating system.
101 For instance, Unix's format is just one byte of `line-feed' code,
102 whereas DOS's format is two-byte sequence of `carriage-return' and
103 `line-feed' codes. MacOS's format is usually one byte of
104 `carriage-return'.
105
106 Since text character encoding and end-of-line encoding are
107 independent, any coding system described above can have any
108 end-of-line format. So Emacs has information about end-of-line
109 format in each coding-system. See section 6 for more details.
110
111 */
112
113 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
114
115 These functions check if a text between SRC and SRC_END is encoded
116 in the coding system category XXX. Each returns an integer value in
117 which appropriate flag bits for the category XXX are set. The flag
118 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
119 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
120 of the range 0x80..0x9F are in multibyte form. */
121 #if 0
122 int
123 detect_coding_emacs_mule (src, src_end, multibytep)
124 unsigned char *src, *src_end;
125 int multibytep;
126 {
127 ...
128 }
129 #endif
130
131 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
132
133 These functions decode SRC_BYTES length of unibyte text at SOURCE
134 encoded in CODING to Emacs' internal format. The resulting
135 multibyte text goes to a place pointed to by DESTINATION, the length
136 of which should not exceed DST_BYTES.
137
138 These functions set the information about original and decoded texts
139 in the members `produced', `produced_char', `consumed', and
140 `consumed_char' of the structure *CODING. They also set the member
141 `result' to one of CODING_FINISH_XXX indicating how the decoding
142 finished.
143
144 DST_BYTES zero means that the source area and destination area are
145 overlapped, which means that we can produce a decoded text until it
146 reaches the head of the not-yet-decoded source text.
147
148 Below is a template for these functions. */
149 #if 0
150 static void
151 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
152 struct coding_system *coding;
153 const unsigned char *source;
154 unsigned char *destination;
155 int src_bytes, dst_bytes;
156 {
157 ...
158 }
159 #endif
160
161 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
162
163 These functions encode SRC_BYTES length text at SOURCE from Emacs'
164 internal multibyte format to CODING. The resulting unibyte text
165 goes to a place pointed to by DESTINATION, the length of which
166 should not exceed DST_BYTES.
167
168 These functions set the information about original and encoded texts
169 in the members `produced', `produced_char', `consumed', and
170 `consumed_char' of the structure *CODING. They also set the member
171 `result' to one of CODING_FINISH_XXX indicating how the encoding
172 finished.
173
174 DST_BYTES zero means that the source area and destination area are
175 overlapped, which means that we can produce encoded text until it
176 reaches the head of the not-yet-encoded source text.
177
178 Below is a template for these functions. */
179 #if 0
180 static void
181 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
182 struct coding_system *coding;
183 unsigned char *source, *destination;
184 int src_bytes, dst_bytes;
185 {
186 ...
187 }
188 #endif
189
190 /*** COMMONLY USED MACROS ***/
191
192 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
193 get one and two bytes from the source text respectively.
194 If there are not enough bytes in the source, they jump to
195 `label_end_of_loop'. The caller should set variables `coding',
196 `src' and `src_end' to appropriate pointer in advance. These
197 macros are called from decoding routines `decode_coding_XXX', thus
198 it is assumed that the source text is unibyte. */
199
/* Fetch the next source byte into C1 and step `src' past it.  When
   the source is exhausted, record CODING_FINISH_INSUFFICIENT_SRC in
   `coding->result' and jump to `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      c1 = *src++;                                              \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
209
/* Fetch the next two source bytes into C1 and C2 and step `src' past
   both.  When fewer than two bytes remain, nothing is consumed:
   record CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump
   to `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src_end - src < 2)                                      \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    c1 = src[0];                                                \
    c2 = src[1];                                                \
    src += 2;                                                   \
  } while (0)
220
221
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero: LEADING_CODE_8_BIT_CONTROL followed
   by a trailing byte is folded back into (trailing byte - 0x20).

   If SRC is not less than SRC_END -- either before the first byte or
   between the leading code and its trailing byte -- set
   coding->result to CODING_FINISH_INSUFFICIENT_SRC and return from
   the enclosing function with RET.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        return ret;                                             \
      }                                                         \
    c1 = *src++;                                                \
    if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
      {                                                         \
        /* The trailing byte must not be read past SRC_END.  */ \
        if (src >= src_end)                                     \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            return ret;                                         \
          }                                                     \
        c1 = *src++ - 0x20;                                     \
      }                                                         \
  } while (0)
237
/* Read the next character of the source text at `src' into C and
   advance `src' past it.  With no bytes left, record
   CODING_FINISH_INSUFFICIENT_SRC in `coding->result' and jump to
   `label_end_of_loop'.  The caller must have set up `coding', `src',
   `src_end' and `translation_table' beforehand.

   This macro serves the encoding routines `encode_coding_XXX', so
   the source is taken to be in multibyte form except for 8-bit
   characters.  Those are in multibyte form when
   coding->src_multibyte is nonzero; otherwise a byte that does not
   start a valid multibyte sequence is taken as a character by
   itself.  When `translation_table' is non-nil, C is passed through
   translate_char before `src' is advanced.  */

#define ONE_MORE_CHAR(c)                                        \
  do {                                                          \
    int len = src_end - src;                                    \
    int bytes;                                                  \
    if (len <= 0)                                               \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if (! coding->src_multibyte                                 \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))      \
      {                                                         \
        /* A lone byte stands for itself.  */                   \
        c = *src;                                               \
        bytes = 1;                                              \
      }                                                         \
    else                                                        \
      c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
    if (!NILP (translation_table))                              \
      c = translate_char (translation_table, c, -1, 0, 0);      \
    src += bytes;                                               \
  } while (0)
266
267
/* Emit character C while decoding.

   Unless a composition other than COMPOSITION_RELATIVE or
   COMPOSITION_WITH_RULE is in progress, the multibyte form of C is
   written at `dst' and coding->produced_char is bumped; if `dst'
   lacks room, record CODING_FINISH_INSUFFICIENT_DST and jump to
   `label_end_of_loop'.

   If a composition other than COMPOSITION_RELATIVE is in progress,
   C is also (or instead) recorded as a component of the current
   composition in coding->cmp_data->data, and
   coding->composition_rule_follows is set unless the method is
   COMPOSITION_WITH_ALTCHARS.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! (COMPOSING_P (coding)                                         \
           && coding->composing != COMPOSITION_RELATIVE                 \
           && coding->composing != COMPOSITION_WITH_RULE))              \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst_bytes ? dst_end : src) < (dst + bytes))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        /* Evaluated after the component was added, since adding  */    \
        /* it may have changed coding->composing.  */                   \
        coding->composition_rule_follows                                \
          = (coding->composing != COMPOSITION_WITH_ALTCHARS);           \
      }                                                                 \
  } while (0)
302
303
/* Store byte C at `dst' and advance `dst'.  The writable area ends
   at `dst_end' when `dst_bytes' is nonzero, otherwise at `src'.
   With no room left, record CODING_FINISH_INSUFFICIENT_DST in
   `coding->result' and jump to `label_end_of_loop'.  */
#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = c;                                               \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
313
/* Store bytes C1 and C2 at `dst' and advance `dst' by two.  The
   writable area ends at `dst_end' when `dst_bytes' is nonzero,
   otherwise at `src'.  Unless both bytes fit, nothing is written:
   record CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump
   to `label_end_of_loop'.  */
#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if ((dst_bytes ? dst_end : src) - dst < 2)                  \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    *dst++ = c1;                                                \
    *dst++ = c2;                                                \
  } while (0)
323
/* Copy the bytes in [FROM, TO) to `dst', advancing both FROM and
   `dst'; FROM equals TO afterwards.  The writable area ends at
   `dst_end' when `dst_bytes' is nonzero, otherwise at `src'.  Unless
   the whole run fits, nothing is copied: record
   CODING_FINISH_INSUFFICIENT_DST in `coding->result' and jump to
   `label_end_of_loop'.  */
#define EMIT_BYTES(from, to)                                    \
  do {                                                          \
    if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    for (; from < to; from++, dst++)                            \
      *dst = *from;                                             \
  } while (0)
334
335 \f
336 /*** 1. Preamble ***/
337
338 #ifdef emacs
339 #include <config.h>
340 #endif
341
342 #include <stdio.h>
343
344 #ifdef emacs
345
346 #include "lisp.h"
347 #include "buffer.h"
348 #include "charset.h"
349 #include "composite.h"
350 #include "ccl.h"
351 #include "coding.h"
352 #include "window.h"
353 #include "intervals.h"
354 #include "frame.h"
355 #include "termhooks.h"
356
357 #else /* not emacs */
358
359 #include "mulelib.h"
360
361 #endif /* not emacs */
362
363 Lisp_Object Qcoding_system, Qeol_type;
364 Lisp_Object Qbuffer_file_coding_system;
365 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
366 Lisp_Object Qno_conversion, Qundecided;
367 Lisp_Object Qcoding_system_history;
368 Lisp_Object Qsafe_chars;
369 Lisp_Object Qvalid_codes;
370 Lisp_Object Qascii_incompatible;
371
372 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
373 Lisp_Object Qcall_process, Qcall_process_region;
374 Lisp_Object Qstart_process, Qopen_network_stream;
375 Lisp_Object Qtarget_idx;
376
377 extern Lisp_Object Qcompletion_ignore_case;
378
379 /* If a symbol has this property, evaluate the value to define the
380 symbol as a coding system. */
381 Lisp_Object Qcoding_system_define_form;
382
383 Lisp_Object Vselect_safe_coding_system_function;
384
385 int coding_system_require_warning;
386
387 /* Mnemonic string for each format of end-of-line. */
388 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
389 /* Mnemonic string to indicate format of end-of-line is not yet
390 decided. */
391 Lisp_Object eol_mnemonic_undecided;
392
393 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
394 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
395 This has an effect only for external encoding (i.e. for output to
396 file and process), not for in-buffer or Lisp string encoding. */
397 int system_eol_type;
398
399 #ifdef emacs
400
401 /* Information about which coding system is safe for which chars.
402 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
403
404 GENERIC-LIST is a list of generic coding systems which can encode
405 any characters.
406
407 NON-GENERIC-ALIST is an alist of non generic coding systems vs the
408 corresponding char table that contains safe chars. */
409 Lisp_Object Vcoding_system_safe_chars;
410
411 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
412
413 Lisp_Object Qcoding_system_p, Qcoding_system_error;
414
415 /* Coding system emacs-mule and raw-text are for converting only
416 end-of-line format. */
417 Lisp_Object Qemacs_mule, Qraw_text;
418
419 Lisp_Object Qutf_8;
420
421 /* Coding-systems are handed between Emacs Lisp programs and C internal
422 routines by the following three variables. */
423 /* Coding-system for reading files and receiving data from process. */
424 Lisp_Object Vcoding_system_for_read;
425 /* Coding-system for writing files and sending data to process. */
426 Lisp_Object Vcoding_system_for_write;
427 /* Coding-system actually used in the latest I/O. */
428 Lisp_Object Vlast_coding_system_used;
429
430 /* A vector of length 256 which contains information about special
431 Latin codes (especially for dealing with Microsoft codes). */
432 Lisp_Object Vlatin_extra_code_table;
433
434 /* Flag to inhibit code conversion of end-of-line format. */
435 int inhibit_eol_conversion;
436
437 /* Flag to inhibit ISO2022 escape sequence detection. */
438 int inhibit_iso_escape_detection;
439
440 /* Flag to make buffer-file-coding-system inherit from process-coding. */
441 int inherit_process_coding_system;
442
443 /* Coding system to be used to encode text for terminal display when
444 terminal coding system is nil. */
445 struct coding_system safe_terminal_coding;
446
447 /* Default coding system to be used to write a file. */
448 struct coding_system default_buffer_file_coding;
449
450 Lisp_Object Vfile_coding_system_alist;
451 Lisp_Object Vprocess_coding_system_alist;
452 Lisp_Object Vnetwork_coding_system_alist;
453
454 Lisp_Object Vlocale_coding_system;
455
456 #endif /* emacs */
457
458 Lisp_Object Qcoding_category, Qcoding_category_index;
459
460 /* List of symbols `coding-category-xxx' ordered by priority. */
461 Lisp_Object Vcoding_category_list;
462
463 /* Table of coding categories (Lisp symbols). */
464 Lisp_Object Vcoding_category_table;
465
466 /* Table of names of symbol for each coding-category. */
467 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
468 "coding-category-emacs-mule",
469 "coding-category-sjis",
470 "coding-category-iso-7",
471 "coding-category-iso-7-tight",
472 "coding-category-iso-8-1",
473 "coding-category-iso-8-2",
474 "coding-category-iso-7-else",
475 "coding-category-iso-8-else",
476 "coding-category-ccl",
477 "coding-category-big5",
478 "coding-category-utf-8",
479 "coding-category-utf-16-be",
480 "coding-category-utf-16-le",
481 "coding-category-raw-text",
482 "coding-category-binary"
483 };
484
485 /* Table of pointers to coding systems corresponding to each coding
486 categories. */
487 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
488
489 /* Table of coding category masks. Nth element is a mask for a coding
490 category of which priority is Nth. */
491 static
492 int coding_priorities[CODING_CATEGORY_IDX_MAX];
493
494 /* Flag to tell if we look up translation table on character code
495 conversion. */
496 Lisp_Object Venable_character_translation;
497 /* Standard translation table to look up on decoding (reading). */
498 Lisp_Object Vstandard_translation_table_for_decode;
499 /* Standard translation table to look up on encoding (writing). */
500 Lisp_Object Vstandard_translation_table_for_encode;
501
502 Lisp_Object Qtranslation_table;
503 Lisp_Object Qtranslation_table_id;
504 Lisp_Object Qtranslation_table_for_decode;
505 Lisp_Object Qtranslation_table_for_encode;
506
507 /* Alist of charsets vs revision number. */
508 Lisp_Object Vcharset_revision_alist;
509
510 /* Default coding systems used for process I/O. */
511 Lisp_Object Vdefault_process_coding_system;
512
513 /* Char table for translating Quail and self-inserting input. */
514 Lisp_Object Vtranslation_table_for_input;
515
516 /* Global flag to tell that we can't call post-read-conversion and
517 pre-write-conversion functions. Usually the value is zero, but it
518 is set to 1 temporarily while such functions are running. This is
519 to avoid infinite recursive call. */
520 static int inhibit_pre_post_conversion;
521
522 Lisp_Object Qchar_coding_system;
523
524 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
525 its validity. */
526
527 Lisp_Object
528 coding_safe_chars (coding_system)
529 Lisp_Object coding_system;
530 {
531 Lisp_Object coding_spec, plist, safe_chars;
532
533 coding_spec = Fget (coding_system, Qcoding_system);
534 plist = XVECTOR (coding_spec)->contents[3];
535 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
536 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
537 }
538
/* Nonzero if character C is safe according to SAFE_CHARS, which is
   either Qt (every character is safe) or a char table whose entry
   for C is non-nil exactly when C is safe.  */
#define CODING_SAFE_CHAR_P(safe_chars, c) \
  (EQ (safe_chars, Qt) ? 1 : !NILP (CHAR_TABLE_REF (safe_chars, c)))
541
542 \f
543 /*** 2. Emacs internal format (emacs-mule) handlers ***/
544
545 /* Emacs' internal format for representation of multiple character
546 sets is a kind of multi-byte encoding, i.e. characters are
547 represented by variable-length sequences of one-byte codes.
548
549 ASCII characters and control characters (e.g. `tab', `newline') are
550 represented by one-byte sequences which are their ASCII codes, in
551 the range 0x00 through 0x7F.
552
553 8-bit characters of the range 0x80..0x9F are represented by
554 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
555 code + 0x20).
556
557 8-bit characters of the range 0xA0..0xFF are represented by
558 one-byte sequences which are their 8-bit code.
559
560 The other characters are represented by a sequence of `base
561 leading-code', optional `extended leading-code', and one or two
562 `position-code's. The length of the sequence is determined by the
563 base leading-code. Leading-code takes the range 0x81 through 0x9D,
564 whereas extended leading-code and position-code take the range 0xA0
565 through 0xFF. See `charset.h' for more details about leading-code
566 and position-code.
567
568 --- CODE RANGE of Emacs' internal format ---
569 character set range
570 ------------- -----
571 ascii 0x00..0x7F
572 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
573 eight-bit-graphic 0xA0..0xFF
574 ELSE 0x81..0x9D + [0xA0..0xFF]+
575 ---------------------------------------------
576
577 As this is the internal character representation, the format is
578 usually not used externally (i.e. in a file or in a data sent to a
579 process). But, it is possible to have a text externally in this
580 format (i.e. by encoding by the coding system `emacs-mule').
581
582 In that case, a sequence of one-byte codes has a slightly different
583 form.
584
585 Firstly, all characters in eight-bit-control are represented by
586 one-byte sequences which are their 8-bit code.
587
588 Next, character composition data are represented by the byte
589 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
590 where,
591 METHOD is 0xF0 plus one of composition method (enum
592 composition_method),
593
594 BYTES is 0xA0 plus the byte length of these composition data,
595
596 CHARS is 0xA0 plus the number of characters composed by these
597 data,
598
599 COMPONENTs are characters of multibyte form or composition
600 rules encoded by two-byte of ASCII codes.
601
602 In addition, for backward compatibility, the following formats are
603 also recognized as composition data on decoding.
604
605 0x80 MSEQ ...
606 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
607
608 Here,
609 MSEQ is a multibyte form but in this special format:
610 ASCII: 0xA0 ASCII_CODE+0x80,
611 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
612 RULE is a one byte code of the range 0xA0..0xF0 that
613 represents a composition rule.
614 */
615
616 enum emacs_code_class_type emacs_code_class[256];
617
618 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
619 Check if a text is encoded in Emacs' internal format. If it is,
620 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
621
622 static int
623 detect_coding_emacs_mule (src, src_end, multibytep)
624 unsigned char *src, *src_end;
625 int multibytep;
626 {
627 unsigned char c;
628 int composing = 0;
629 /* Dummy for ONE_MORE_BYTE. */
630 struct coding_system dummy_coding;
631 struct coding_system *coding = &dummy_coding;
632
633 while (1)
634 {
635 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
636 CODING_CATEGORY_MASK_EMACS_MULE);
637 if (composing)
638 {
639 if (c < 0xA0)
640 composing = 0;
641 else if (c == 0xA0)
642 {
643 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
644 c &= 0x7F;
645 }
646 else
647 c -= 0x20;
648 }
649
650 if (c < 0x20)
651 {
652 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
653 return 0;
654 }
655 else if (c >= 0x80 && c < 0xA0)
656 {
657 if (c == 0x80)
658 /* Old leading code for a composite character. */
659 composing = 1;
660 else
661 {
662 unsigned char *src_base = src - 1;
663 int bytes;
664
665 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
666 bytes))
667 return 0;
668 src = src_base + bytes;
669 }
670 }
671 }
672 }
673
674
/* Open a new composition record in CODING->cmp_data for a
   composition that begins at character position START and uses
   METHOD.  The record header takes four slots: slot 0 is set to -1,
   slot 1 to START offset by cmp_data->char_offset, and slot 3 to
   METHOD; slot 2 is left untouched here.  The index of the header is
   remembered in CODING->cmp_data_start.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmp_data = coding->cmp_data;               \
    int *data = &cmp_data->data[cmp_data->used];                        \
                                                                        \
    coding->cmp_data_start = cmp_data->used;                            \
    data[0] = -1;                                                       \
    data[1] = cmp_data->char_offset + start;                            \
    data[3] = (int) method;                                             \
    cmp_data->used += 4;                                                \
  } while (0)
687
/* Close the composition record whose header starts at
   CODING->cmp_data_start: slot 0 of the header receives the number
   of slots used since the header began, and slot 2 receives the
   ending character position END offset by cmp_data->char_offset.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmp_data = coding->cmp_data;               \
    int *data = &cmp_data->data[coding->cmp_data_start];                \
                                                                        \
    data[0] = cmp_data->used - coding->cmp_data_start;                  \
    data[2] = cmp_data->char_offset + end;                              \
  } while (0)
697
/* Append one COMPONENT (alternate character or composition rule) to
   the composition record being built in CODING->cmp_data.  Once the
   record has grown to COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, it is
   closed at CODING->produced_char and CODING->composing is reset to
   COMPOSITION_NO.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    coding->cmp_data->data[coding->cmp_data->used++] = component;       \
    if (COMPOSITION_DATA_MAX_BUNCH_LENGTH                               \
        == coding->cmp_data->used - coding->cmp_data_start)             \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
  } while (0)
710
711
/* Yield the byte that SRC points at and advance SRC past it.  When
   SRC is not less than SRC_END, yield -1 and leave SRC alone.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
716
717
/* Decode one character that is a component of an Emacs 20 style
   composition sequence at SRC.  Set C to that character, store its
   multibyte form at P, and leave P pointing just past what was
   stored.  C is set to -1 when the source is exhausted or no valid
   character is found.

   Two encodings of a component are recognized:
     0xA0 followed by (ASCII code + 0x80), and
     (leading code + 0x20) followed by the remaining bytes.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
  do {                                                          \
    int bytes;                                                  \
                                                                \
    c = SAFE_ONE_MORE_BYTE ();                                  \
    if (c < 0)                                                  \
      break;                                                    \
    if (CHAR_HEAD_P (c))                                        \
      c = -1;                                                   \
    else if (c == 0xA0)                                         \
      {                                                         \
        c = SAFE_ONE_MORE_BYTE ();                              \
        if (c >= 0xA0)                                          \
          {                                                     \
            c -= 0x80;                                          \
            *p++ = c;                                           \
          }                                                     \
        else                                                    \
          c = -1;                                               \
      }                                                         \
    else if (! BASE_LEADING_CODE_P (c - 0x20))                  \
      c = -1;                                                   \
    else                                                        \
      {                                                         \
        unsigned char *p0 = p;                                  \
                                                                \
        c -= 0x20;                                              \
        *p++ = c;                                               \
        /* Copy the bytes that follow the leading code.  */     \
        for (bytes = BYTES_BY_CHAR_HEAD (c); --bytes;)          \
          {                                                     \
            c = SAFE_ONE_MORE_BYTE ();                          \
            if (c < 0)                                          \
              break;                                            \
            *p++ = c;                                           \
          }                                                     \
        if (! UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)    \
            && ! (coding->flags /* We are recovering a file.  */ \
                  && p0[0] == LEADING_CODE_8_BIT_CONTROL        \
                  && ! CHAR_HEAD_P (p0[1])))                    \
          c = -1;                                               \
        else                                                    \
          c = STRING_CHAR (p0, bytes);                          \
      }                                                         \
  } while (0)
768
769
/* Decode one composition rule that is a component of an Emacs 20
   style composition sequence at SRC.  The rule byte is 0xA0 plus a
   value below 81, which is split into the caller's `gref'
   (value / 9) and `nref' (value % 9).  Set C to the rule encoded by
   COMPOSITION_ENCODE_RULE.  If no valid rule is found, set C to
   -1.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE ();                          \
    c -= 0xA0;                                          \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
786
787
788 /* Decode composition sequence encoded by `emacs-mule' at the source
789 pointed by SRC. SRC_END is the end of source. Store information
790 of the composition in CODING->cmp_data.
791
792 For backward compatibility, decode also a composition sequence of
793 Emacs 20 style. In that case, the composition sequence contains
794 characters that should be extracted into a buffer or string. Store
795 those characters at *DESTINATION in multibyte form.
796
797 If we encounter an invalid byte sequence, return 0.
798 If we encounter an insufficient source or destination, or
799 insufficient space in CODING->cmp_data, return 1.
800 Otherwise, return consumed bytes in the source.
801
802 */
803 static INLINE int
804 decode_composition_emacs_mule (coding, src, src_end,
805 destination, dst_end, dst_bytes)
806 struct coding_system *coding;
807 const unsigned char *src, *src_end;
808 unsigned char **destination, *dst_end;
809 int dst_bytes;
810 {
811 unsigned char *dst = *destination;
812 int method, data_len, nchars;
813 const unsigned char *src_base = src++;
814 /* Store components of composition. */
815 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
816 int ncomponent;
817 /* Store multibyte form of characters to be composed. This is for
818 Emacs 20 style composition sequence. */
819 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
820 unsigned char *bufp = buf;
821 int c, i, gref, nref;
822
823 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
824 >= COMPOSITION_DATA_SIZE)
825 {
826 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
827 return -1;
828 }
829
830 ONE_MORE_BYTE (c);
831 if (c - 0xF0 >= COMPOSITION_RELATIVE
832 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
833 {
834 int with_rule;
835
836 method = c - 0xF0;
837 with_rule = (method == COMPOSITION_WITH_RULE
838 || method == COMPOSITION_WITH_RULE_ALTCHARS);
839 ONE_MORE_BYTE (c);
840 data_len = c - 0xA0;
841 if (data_len < 4
842 || src_base + data_len > src_end)
843 return 0;
844 ONE_MORE_BYTE (c);
845 nchars = c - 0xA0;
846 if (c < 1)
847 return 0;
848 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
849 {
850 /* If it is longer than this, it can't be valid. */
851 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
852 return 0;
853
854 if (ncomponent % 2 && with_rule)
855 {
856 ONE_MORE_BYTE (gref);
857 gref -= 32;
858 ONE_MORE_BYTE (nref);
859 nref -= 32;
860 c = COMPOSITION_ENCODE_RULE (gref, nref);
861 }
862 else
863 {
864 int bytes;
865 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
866 || (coding->flags /* We are recovering a file. */
867 && src[0] == LEADING_CODE_8_BIT_CONTROL
868 && ! CHAR_HEAD_P (src[1])))
869 c = STRING_CHAR (src, bytes);
870 else
871 c = *src, bytes = 1;
872 src += bytes;
873 }
874 component[ncomponent] = c;
875 }
876 }
877 else if (c >= 0x80)
878 {
879 /* This may be an old Emacs 20 style format. See the comment at
880 the section 2 of this file. */
881 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
882 if (src == src_end
883 && !(coding->mode & CODING_MODE_LAST_BLOCK))
884 goto label_end_of_loop;
885
886 src_end = src;
887 src = src_base + 1;
888 if (c < 0xC0)
889 {
890 method = COMPOSITION_RELATIVE;
891 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
892 {
893 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
894 if (c < 0)
895 break;
896 component[ncomponent++] = c;
897 }
898 if (ncomponent < 2)
899 return 0;
900 nchars = ncomponent;
901 }
902 else if (c == 0xFF)
903 {
904 method = COMPOSITION_WITH_RULE;
905 src++;
906 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
907 if (c < 0)
908 return 0;
909 component[0] = c;
910 for (ncomponent = 1;
911 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
912 {
913 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
914 if (c < 0)
915 break;
916 component[ncomponent++] = c;
917 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
918 if (c < 0)
919 break;
920 component[ncomponent++] = c;
921 }
922 if (ncomponent < 3)
923 return 0;
924 nchars = (ncomponent + 1) / 2;
925 }
926 else
927 return 0;
928 }
929 else
930 return 0;
931
932 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
933 {
934 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
935 for (i = 0; i < ncomponent; i++)
936 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
937 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
938 if (buf < bufp)
939 {
940 unsigned char *p = buf;
941 EMIT_BYTES (p, bufp);
942 *destination += bufp - buf;
943 coding->produced_char += nchars;
944 }
945 return (src - src_base);
946 }
947 label_end_of_loop:
948 return -1;
949 }
950
951 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
952
953 static void
954 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
955 struct coding_system *coding;
956 const unsigned char *source;
957 unsigned char *destination;
958 int src_bytes, dst_bytes;
959 {
960 const unsigned char *src = source;
961 const unsigned char *src_end = source + src_bytes;
962 unsigned char *dst = destination;
963 unsigned char *dst_end = destination + dst_bytes;
964 /* SRC_BASE remembers the start position in source in each loop.
965 The loop will be exited when there's not enough source code, or
966 when there's not enough destination area to produce a
967 character. */
968 const unsigned char *src_base;
969
970 coding->produced_char = 0;
971 while ((src_base = src) < src_end)
972 {
973 unsigned char tmp[MAX_MULTIBYTE_LENGTH];
974 const unsigned char *p;
975 int bytes;
976
977 if (*src == '\r')
978 {
979 int c = *src++;
980
981 if (coding->eol_type == CODING_EOL_CR)
982 c = '\n';
983 else if (coding->eol_type == CODING_EOL_CRLF)
984 {
985 ONE_MORE_BYTE (c);
986 if (c != '\n')
987 {
988 src--;
989 c = '\r';
990 }
991 }
992 *dst++ = c;
993 coding->produced_char++;
994 continue;
995 }
996 else if (*src == '\n')
997 {
998 if ((coding->eol_type == CODING_EOL_CR
999 || coding->eol_type == CODING_EOL_CRLF)
1000 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1001 {
1002 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1003 goto label_end_of_loop;
1004 }
1005 *dst++ = *src++;
1006 coding->produced_char++;
1007 continue;
1008 }
1009 else if (*src == 0x80 && coding->cmp_data)
1010 {
1011 /* Start of composition data. */
1012 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1013 &dst, dst_end,
1014 dst_bytes);
1015 if (consumed < 0)
1016 goto label_end_of_loop;
1017 else if (consumed > 0)
1018 {
1019 src += consumed;
1020 continue;
1021 }
1022 bytes = CHAR_STRING (*src, tmp);
1023 p = tmp;
1024 src++;
1025 }
1026 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1027 || (coding->flags /* We are recovering a file. */
1028 && src[0] == LEADING_CODE_8_BIT_CONTROL
1029 && ! CHAR_HEAD_P (src[1])))
1030 {
1031 p = src;
1032 src += bytes;
1033 }
1034 else
1035 {
1036 int i, c;
1037
1038 bytes = BYTES_BY_CHAR_HEAD (*src);
1039 src++;
1040 for (i = 1; i < bytes; i++)
1041 {
1042 ONE_MORE_BYTE (c);
1043 if (CHAR_HEAD_P (c))
1044 break;
1045 }
1046 if (i < bytes)
1047 {
1048 bytes = CHAR_STRING (*src_base, tmp);
1049 p = tmp;
1050 src = src_base + 1;
1051 }
1052 else
1053 {
1054 p = src_base;
1055 }
1056 }
1057 if (dst + bytes >= (dst_bytes ? dst_end : src))
1058 {
1059 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1060 break;
1061 }
1062 while (bytes--) *dst++ = *p++;
1063 coding->produced_char++;
1064 }
1065 label_end_of_loop:
1066 coding->consumed = coding->consumed_char = src_base - source;
1067 coding->produced = dst - destination;
1068 }
1069
1070
/* Encode the composition record at DATA into the special byte
   sequence that starts with 0x80 and append it at DST.  The sequence
   is 0x80, 0xF0 + METHOD, 0xA0 + byte length of the whole sequence,
   0xA0 + number of composed characters, followed by the components
   (with a two-byte rule before every component after the first for
   the rule-based methods).

   Afterwards advance CODING->cmp_data_start past the record and, if
   the current block is exhausted and another one is chained, move
   CODING->cmp_data to it.  Uses DST, DST_END, DST_BYTES, SRC and
   label_end_of_loop from the expansion site; jumps to that label
   with CODING_FINISH_INSUFFICIENT_DST when the output does not
   fit.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char seq[1024];                                            \
    unsigned char *wr = seq + 4;                                        \
    unsigned char *rd;                                                  \
    int record_len = data[0];                                           \
    int k;                                                              \
                                                                        \
    seq[0] = 0x80;                                                      \
    seq[1] = 0xF0 + data[3];    /* METHOD */                            \
    seq[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
    if (data[3] != COMPOSITION_WITH_RULE                                \
        && data[3] != COMPOSITION_WITH_RULE_ALTCHARS)                   \
      {                                                                 \
        /* Plain list of components.  */                                \
        for (k = 4; k < record_len; k++)                                \
          wr += CHAR_STRING (data[k], wr);                              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* First component, then (rule, component) pairs.  */           \
        wr += CHAR_STRING (data[4], wr);                                \
        for (k = 5; k < record_len; k += 2)                             \
          {                                                             \
            int gref, nref;                                             \
            COMPOSITION_DECODE_RULE (data[k], gref, nref);              \
            *wr++ = 0x20 + gref;                                        \
            *wr++ = 0x20 + nref;                                        \
            wr += CHAR_STRING (data[k + 1], wr);                        \
          }                                                             \
      }                                                                 \
    seq[2] = 0xA0 + (wr - seq); /* COMPONENTS-BYTES */                  \
                                                                        \
    if (dst + (wr - seq) + 4 > (dst_bytes ? dst_end : src))             \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (rd = seq; rd < wr; rd++)                                       \
      *dst++ = *rd;                                                     \
    coding->cmp_data_start += record_len;                               \
    if (coding->cmp_data_start == coding->cmp_data->used                \
        && coding->cmp_data->next)                                      \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
1120
1121
1122 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1123 unsigned char *, int, int));
1124
1125 static void
1126 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1127 struct coding_system *coding;
1128 const unsigned char *source;
1129 unsigned char *destination;
1130 int src_bytes, dst_bytes;
1131 {
1132 const unsigned char *src = source;
1133 const unsigned char *src_end = source + src_bytes;
1134 unsigned char *dst = destination;
1135 unsigned char *dst_end = destination + dst_bytes;
1136 const unsigned char *src_base;
1137 int c;
1138 int char_offset;
1139 int *data;
1140
1141 Lisp_Object translation_table;
1142
1143 translation_table = Qnil;
1144
1145 /* Optimization for the case that there's no composition. */
1146 if (!coding->cmp_data || coding->cmp_data->used == 0)
1147 {
1148 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1149 return;
1150 }
1151
1152 char_offset = coding->cmp_data->char_offset;
1153 data = coding->cmp_data->data + coding->cmp_data_start;
1154 while (1)
1155 {
1156 src_base = src;
1157
1158 /* If SRC starts a composition, encode the information about the
1159 composition in advance. */
1160 if (coding->cmp_data_start < coding->cmp_data->used
1161 && char_offset + coding->consumed_char == data[1])
1162 {
1163 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1164 char_offset = coding->cmp_data->char_offset;
1165 data = coding->cmp_data->data + coding->cmp_data_start;
1166 }
1167
1168 ONE_MORE_CHAR (c);
1169 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1170 || coding->eol_type == CODING_EOL_CR))
1171 {
1172 if (coding->eol_type == CODING_EOL_CRLF)
1173 EMIT_TWO_BYTES ('\r', c);
1174 else
1175 EMIT_ONE_BYTE ('\r');
1176 }
1177 else if (SINGLE_BYTE_CHAR_P (c))
1178 {
1179 if (coding->flags && ! ASCII_BYTE_P (c))
1180 {
1181 /* As we are auto saving, retain the multibyte form for
1182 8-bit chars. */
1183 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1184 int bytes = CHAR_STRING (c, buf);
1185
1186 if (bytes == 1)
1187 EMIT_ONE_BYTE (buf[0]);
1188 else
1189 EMIT_TWO_BYTES (buf[0], buf[1]);
1190 }
1191 else
1192 EMIT_ONE_BYTE (c);
1193 }
1194 else
1195 EMIT_BYTES (src_base, src);
1196 coding->consumed_char++;
1197 }
1198 label_end_of_loop:
1199 coding->consumed = src_base - source;
1200 coding->produced = coding->produced_char = dst - destination;
1201 return;
1202 }
1203
1204 \f
1205 /*** 3. ISO2022 handlers ***/
1206
1207 /* The following note describes the coding system ISO2022 briefly.
1208 Since the intention of this note is to help understand the
1209 functions in this file, some parts are NOT ACCURATE or are OVERLY
1210 SIMPLIFIED. For thorough understanding, please refer to the
1211 original document of ISO2022. This is equivalent to the standard
1212 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1213
1214 ISO2022 provides many mechanisms to encode several character sets
1215 in 7-bit and 8-bit environments. For 7-bit environments, all text
1216 is encoded using bytes less than 128. This may make the encoded
1217 text a little bit longer, but the text passes more easily through
1218 several types of gateway, some of which strip off the MSB (Most
1219 Significant Bit).
1220
1221 There are two kinds of character sets: control character sets and
1222 graphic character sets. The former contain control characters such
1223 as `newline' and `escape' to provide control functions (control
1224 functions are also provided by escape sequences). The latter
1225 contain graphic characters such as 'A' and '-'. Emacs recognizes
1226 two control character sets and many graphic character sets.
1227
1228 Graphic character sets are classified into one of the following
1229 four classes, according to the number of bytes (DIMENSION) and
1230 number of characters in one dimension (CHARS) of the set:
1231 - DIMENSION1_CHARS94
1232 - DIMENSION1_CHARS96
1233 - DIMENSION2_CHARS94
1234 - DIMENSION2_CHARS96
1235
1236 In addition, each character set is assigned an identification tag,
1237 unique for each set, called the "final character" (denoted as <F>
1238 hereafter). The <F> of each character set is decided by ECMA(*)
1239 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1240 (0x30..0x3F are for private use only).
1241
1242 Note (*): ECMA = European Computer Manufacturers Association
1243
1244 Here are examples of graphic character sets [NAME(<F>)]:
1245 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1246 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1247 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1248 o DIMENSION2_CHARS96 -- none for the moment
1249
1250 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1251 C0 [0x00..0x1F] -- control character plane 0
1252 GL [0x20..0x7F] -- graphic character plane 0
1253 C1 [0x80..0x9F] -- control character plane 1
1254 GR [0xA0..0xFF] -- graphic character plane 1
1255
1256 A control character set is directly designated and invoked to C0 or
1257 C1 by an escape sequence. The most common case is that:
1258 - ISO646's control character set is designated/invoked to C0, and
1259 - ISO6429's control character set is designated/invoked to C1,
1260 and usually these designations/invocations are omitted in encoded
1261 text. In a 7-bit environment, only C0 can be used, and a control
1262 character for C1 is encoded by an appropriate escape sequence to
1263 fit into the environment. All control characters for C1 are
1264 defined to have corresponding escape sequences.
1265
1266 A graphic character set is at first designated to one of four
1267 graphic registers (G0 through G3), then these graphic registers are
1268 invoked to GL or GR. These designations and invocations can be
1269 done independently. The most common case is that G0 is invoked to
1270 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1271 these invocations and designations are omitted in encoded text.
1272 In a 7-bit environment, only GL can be used.
1273
1274 When a graphic character set of CHARS94 is invoked to GL, codes
1275 0x20 and 0x7F of the GL area work as control characters SPACE and
1276 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1277 be used.
1278
1279 There are two ways of invocation: locking-shift and single-shift.
1280 With locking-shift, the invocation lasts until the next different
1281 invocation, whereas with single-shift, the invocation affects the
1282 following character only and doesn't affect the locking-shift
1283 state. Invocations are done by the following control characters or
1284 escape sequences:
1285
1286 ----------------------------------------------------------------------
1287 abbrev function cntrl escape seq description
1288 ----------------------------------------------------------------------
1289 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1290 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1291 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1292 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1293 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1294 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1295 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1296 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1297 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1298 ----------------------------------------------------------------------
1299 (*) These are not used by any known coding system.
1300
1301 Control characters for these functions are defined by macros
1302 ISO_CODE_XXX in `coding.h'.
1303
1304 Designations are done by the following escape sequences:
1305 ----------------------------------------------------------------------
1306 escape sequence description
1307 ----------------------------------------------------------------------
1308 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1309 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1310 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1311 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1312 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1313 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1314 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1315 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1316 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1317 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1318 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1319 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1320 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1321 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1322 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1323 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1324 ----------------------------------------------------------------------
1325
1326 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1327 of dimension 1, chars 94, and final character <F>, etc...
1328
1329 Note (*): Although these designations are not allowed in ISO2022,
1330 Emacs accepts them on decoding, and produces them on encoding
1331 CHARS96 character sets in a coding system which is characterized as
1332 7-bit environment, non-locking-shift, and non-single-shift.
1333
1334 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1335 '(' can be omitted. We refer to this as "short-form" hereafter.
1336
1337 Now you may notice that there are a lot of ways of encoding the
1338 same multilingual text in ISO2022. Actually, there exist many
1339 coding systems such as Compound Text (used in X11's inter client
1340 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1341 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1342 localized platforms), and all of these are variants of ISO2022.
1343
1344 In addition to the above, Emacs handles two more kinds of escape
1345 sequences: ISO6429's direction specification and Emacs' private
1346 sequence for specifying character composition.
1347
1348 ISO6429's direction specification takes the following form:
1349 o CSI ']' -- end of the current direction
1350 o CSI '0' ']' -- end of the current direction
1351 o CSI '1' ']' -- start of left-to-right text
1352 o CSI '2' ']' -- start of right-to-left text
1353 The control character CSI (0x9B: control sequence introducer) is
1354 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1355
1356 Character composition specification takes the following form:
1357 o ESC '0' -- start relative composition
1358 o ESC '1' -- end composition
1359 o ESC '2' -- start rule-base composition (*)
1360 o ESC '3' -- start relative composition with alternate chars (**)
1361 o ESC '4' -- start rule-base composition with alternate chars (**)
1362 Since these are not standard escape sequences of any ISO standard,
1363 the use of them with these meanings is restricted to Emacs only.
1364
1365 (*) This form is used only in Emacs 20.5 and older versions,
1366 but the newer versions can safely decode it.
1367 (**) This form is used only in Emacs 21.1 and newer versions,
1368 and the older versions can't decode it.
1369
1370 Here's a list of example usages of these composition escape
1371 sequences (categorized by `enum composition_method').
1372
1373 COMPOSITION_RELATIVE:
1374 ESC 0 CHAR [ CHAR ] ESC 1
1375 COMPOSITION_WITH_RULE:
1376 ESC 2 CHAR [ RULE CHAR ] ESC 1
1377 COMPOSITION_WITH_ALTCHARS:
1378 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1379 COMPOSITION_WITH_RULE_ALTCHARS:
1380 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1381
1382 enum iso_code_class_type iso_code_class[256];
1383
1384 #define CHARSET_OK(idx, charset, c) \
1385 (coding_system_table[idx] \
1386 && (charset == CHARSET_ASCII \
1387 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1388 CODING_SAFE_CHAR_P (safe_chars, c))) \
1389 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1390 charset) \
1391 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1392
1393 #define SHIFT_OUT_OK(idx) \
1394 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1395
1396 #define COMPOSITION_OK(idx) \
1397 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1398
1399 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1400 Check if a text is encoded in ISO2022. If it is, return an
1401 integer in which appropriate flag bits any of:
1402 CODING_CATEGORY_MASK_ISO_7
1403 CODING_CATEGORY_MASK_ISO_7_TIGHT
1404 CODING_CATEGORY_MASK_ISO_8_1
1405 CODING_CATEGORY_MASK_ISO_8_2
1406 CODING_CATEGORY_MASK_ISO_7_ELSE
1407 CODING_CATEGORY_MASK_ISO_8_ELSE
1408 are set. If a code which should never appear in ISO2022 is found,
1409 returns 0. */
1410
1411 static int
1412 detect_coding_iso2022 (src, src_end, multibytep)
1413 unsigned char *src, *src_end;
1414 int multibytep;
1415 {
1416 int mask = CODING_CATEGORY_MASK_ISO;
1417 int mask_found = 0;
1418 int reg[4], shift_out = 0, single_shifting = 0;
1419 int c, c1, charset;
1420 /* Dummy for ONE_MORE_BYTE. */
1421 struct coding_system dummy_coding;
1422 struct coding_system *coding = &dummy_coding;
1423 Lisp_Object safe_chars;
1424
1425 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1426 while (mask)
1427 {
1428 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1429 retry:
1430 switch (c)
1431 {
1432 case ISO_CODE_ESC:
1433 if (inhibit_iso_escape_detection)
1434 break;
1435 single_shifting = 0;
1436 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1437 if (c >= '(' && c <= '/')
1438 {
1439 /* Designation sequence for a charset of dimension 1. */
1440 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1441 if (c1 < ' ' || c1 >= 0x80
1442 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1443 /* Invalid designation sequence. Just ignore. */
1444 break;
1445 reg[(c - '(') % 4] = charset;
1446 }
1447 else if (c == '$')
1448 {
1449 /* Designation sequence for a charset of dimension 2. */
1450 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1451 if (c >= '@' && c <= 'B')
1452 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1453 reg[0] = charset = iso_charset_table[1][0][c];
1454 else if (c >= '(' && c <= '/')
1455 {
1456 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1457 mask & mask_found);
1458 if (c1 < ' ' || c1 >= 0x80
1459 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1460 /* Invalid designation sequence. Just ignore. */
1461 break;
1462 reg[(c - '(') % 4] = charset;
1463 }
1464 else
1465 /* Invalid designation sequence. Just ignore. */
1466 break;
1467 }
1468 else if (c == 'N' || c == 'O')
1469 {
1470 /* ESC <Fe> for SS2 or SS3. */
1471 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1472 break;
1473 }
1474 else if (c >= '0' && c <= '4')
1475 {
1476 /* ESC <Fp> for start/end composition. */
1477 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1478 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1479 else
1480 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1481 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1482 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1483 else
1484 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1485 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1486 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1487 else
1488 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1489 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1490 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1491 else
1492 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1493 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1494 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1495 else
1496 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1497 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1498 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1499 else
1500 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1501 break;
1502 }
1503 else
1504 /* Invalid escape sequence. Just ignore. */
1505 break;
1506
1507 /* We found a valid designation sequence for CHARSET. */
1508 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1509 c = MAKE_CHAR (charset, 0, 0);
1510 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1511 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1512 else
1513 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1514 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1515 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1516 else
1517 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1518 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1519 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1520 else
1521 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1522 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1523 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1524 else
1525 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1526 break;
1527
1528 case ISO_CODE_SO:
1529 if (inhibit_iso_escape_detection)
1530 break;
1531 single_shifting = 0;
1532 if (shift_out == 0
1533 && (reg[1] >= 0
1534 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1535 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1536 {
1537 /* Locking shift out. */
1538 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1539 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1540 }
1541 break;
1542
1543 case ISO_CODE_SI:
1544 if (inhibit_iso_escape_detection)
1545 break;
1546 single_shifting = 0;
1547 if (shift_out == 1)
1548 {
1549 /* Locking shift in. */
1550 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1551 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1552 }
1553 break;
1554
1555 case ISO_CODE_CSI:
1556 single_shifting = 0;
1557 case ISO_CODE_SS2:
1558 case ISO_CODE_SS3:
1559 {
1560 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1561
1562 if (inhibit_iso_escape_detection)
1563 break;
1564 if (c != ISO_CODE_CSI)
1565 {
1566 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1567 & CODING_FLAG_ISO_SINGLE_SHIFT)
1568 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1569 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1570 & CODING_FLAG_ISO_SINGLE_SHIFT)
1571 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1572 single_shifting = 1;
1573 }
1574 if (VECTORP (Vlatin_extra_code_table)
1575 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1576 {
1577 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1578 & CODING_FLAG_ISO_LATIN_EXTRA)
1579 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1580 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1581 & CODING_FLAG_ISO_LATIN_EXTRA)
1582 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1583 }
1584 mask &= newmask;
1585 mask_found |= newmask;
1586 }
1587 break;
1588
1589 default:
1590 if (c < 0x80)
1591 {
1592 single_shifting = 0;
1593 break;
1594 }
1595 else if (c < 0xA0)
1596 {
1597 single_shifting = 0;
1598 if (VECTORP (Vlatin_extra_code_table)
1599 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1600 {
1601 int newmask = 0;
1602
1603 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1604 & CODING_FLAG_ISO_LATIN_EXTRA)
1605 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1606 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1607 & CODING_FLAG_ISO_LATIN_EXTRA)
1608 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1609 mask &= newmask;
1610 mask_found |= newmask;
1611 }
1612 else
1613 return 0;
1614 }
1615 else
1616 {
1617 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1618 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1619 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1620 /* Check the length of succeeding codes of the range
1621 0xA0..0FF. If the byte length is odd, we exclude
1622 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1623 when we are not single shifting. */
1624 if (!single_shifting
1625 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1626 {
1627 int i = 1;
1628
1629 c = -1;
1630 while (src < src_end)
1631 {
1632 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1633 mask & mask_found);
1634 if (c < 0xA0)
1635 break;
1636 i++;
1637 }
1638
1639 if (i & 1 && src < src_end)
1640 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1641 else
1642 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1643 if (c >= 0)
1644 /* This means that we have read one extra byte. */
1645 goto retry;
1646 }
1647 }
1648 break;
1649 }
1650 }
1651 return (mask & mask_found);
1652 }
1653
/* Return the character code of the character of CHARSET whose first
   position code is C1 and whose second position code is C2.  When
   the variable `translation_table' of the expansion site is
   non-nil, the result comes from translate_char with that table
   instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                           \
  (!NILP (translation_table)                                            \
   ? translate_char (translation_table, -1, charset, c1, c2)            \
   : MAKE_CHAR (charset, c1, c2))
1663
/* Set designation state into CODING: designate the charset
   identified by DIMENSION, CHARS and FINAL_CHAR to graphic register
   REG.  Jumps to label_invalid_code of the expansion site when
   FINAL_CHAR is outside '0'..127, when the charset is unknown, or
   when it is neither requested for REG nor safe for CODING; in the
   last two cases REG is recorded as the last invalid designation
   register.  Uses `coding' and `safe_chars' from the expansion
   site.  The locals CHARSET and C shadow same-named variables
   there, so the arguments must not mention those names.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset, c;                                                        \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    c = MAKE_CHAR (charset, 0, 0);                                         \
    if (charset >= 0                                                       \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1700
1701 /* Allocate a memory block for storing information about compositions.
1702 The block is chained to the already allocated blocks. */
1703
1704 void
1705 coding_allocate_composition_data (coding, char_offset)
1706 struct coding_system *coding;
1707 int char_offset;
1708 {
1709 struct composition_data *cmp_data
1710 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1711
1712 cmp_data->char_offset = char_offset;
1713 cmp_data->used = 0;
1714 cmp_data->prev = coding->cmp_data;
1715 cmp_data->next = NULL;
1716 if (coding->cmp_data)
1717 coding->cmp_data->next = cmp_data;
1718 coding->cmp_data = cmp_data;
1719 coding->cmp_data_start = 0;
1720 coding->composing = COMPOSITION_NO;
1721 }
1722
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the character that followed ESC.  When composition is
   disabled for CODING the two bytes are copied to DST as they are.
   Uses `coding', `dst' and label_end_of_loop from the expansion
   site.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE         \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           the following two, the codes following the current escape       \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1775
/* Handle composition end sequence ESC 1.  While a composition is in
   progress, record its end at the current produced-character count
   and leave composing state.  Otherwise copy ESC and C1 to DST as
   two ordinary characters.  Uses `coding' and `dst' from the
   expansion site.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1;                                                    \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1792
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be a modifiable lvalue: 32 is subtracted from it in place.
   The second byte, when needed, is read into the variable `c2' of
   the expansion site through ONE_MORE_BYTE.  A value of C1 that
   matches neither format stores rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if ((c1) < 81)              /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        if (gref == 4) gref = 10;                                       \
        if (nref == 4) nref = 10;                                       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    else if ((c1) < 93)         /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);            \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1817
1818
1819 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1820
1821 static void
1822 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1823 struct coding_system *coding;
1824 const unsigned char *source;
1825 unsigned char *destination;
1826 int src_bytes, dst_bytes;
1827 {
1828 const unsigned char *src = source;
1829 const unsigned char *src_end = source + src_bytes;
1830 unsigned char *dst = destination;
1831 unsigned char *dst_end = destination + dst_bytes;
1832 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1833 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1834 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1835 /* SRC_BASE remembers the start position in source in each loop.
1836 The loop will be exited when there's not enough source code
1837 (within macro ONE_MORE_BYTE), or when there's not enough
1838 destination area to produce a character (within macro
1839 EMIT_CHAR). */
1840 const unsigned char *src_base;
1841 int c, charset;
1842 Lisp_Object translation_table;
1843 Lisp_Object safe_chars;
1844
1845 safe_chars = coding_safe_chars (coding->symbol);
1846
1847 if (NILP (Venable_character_translation))
1848 translation_table = Qnil;
1849 else
1850 {
1851 translation_table = coding->translation_table_for_decode;
1852 if (NILP (translation_table))
1853 translation_table = Vstandard_translation_table_for_decode;
1854 }
1855
1856 coding->result = CODING_FINISH_NORMAL;
1857
1858 while (1)
1859 {
1860 int c1, c2 = 0;
1861
1862 src_base = src;
1863 ONE_MORE_BYTE (c1);
1864
1865 /* We produce no character or one character. */
1866 switch (iso_code_class [c1])
1867 {
1868 case ISO_0x20_or_0x7F:
1869 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1870 {
1871 DECODE_COMPOSITION_RULE (c1);
1872 continue;
1873 }
1874 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1875 {
1876 /* This is SPACE or DEL. */
1877 charset = CHARSET_ASCII;
1878 break;
1879 }
1880 /* This is a graphic character, we fall down ... */
1881
1882 case ISO_graphic_plane_0:
1883 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1884 {
1885 DECODE_COMPOSITION_RULE (c1);
1886 continue;
1887 }
1888 charset = charset0;
1889 break;
1890
1891 case ISO_0xA0_or_0xFF:
1892 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1893 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1894 goto label_invalid_code;
1895 /* This is a graphic character, we fall down ... */
1896
1897 case ISO_graphic_plane_1:
1898 if (charset1 < 0)
1899 goto label_invalid_code;
1900 charset = charset1;
1901 break;
1902
1903 case ISO_control_0:
1904 if (COMPOSING_P (coding))
1905 DECODE_COMPOSITION_END ('1');
1906
1907 /* All ISO2022 control characters in this class have the
1908 same representation in Emacs internal format. */
1909 if (c1 == '\n'
1910 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1911 && (coding->eol_type == CODING_EOL_CR
1912 || coding->eol_type == CODING_EOL_CRLF))
1913 {
1914 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1915 goto label_end_of_loop;
1916 }
1917 charset = CHARSET_ASCII;
1918 break;
1919
1920 case ISO_control_1:
1921 if (COMPOSING_P (coding))
1922 DECODE_COMPOSITION_END ('1');
1923 goto label_invalid_code;
1924
1925 case ISO_carriage_return:
1926 if (COMPOSING_P (coding))
1927 DECODE_COMPOSITION_END ('1');
1928
1929 if (coding->eol_type == CODING_EOL_CR)
1930 c1 = '\n';
1931 else if (coding->eol_type == CODING_EOL_CRLF)
1932 {
1933 ONE_MORE_BYTE (c1);
1934 if (c1 != ISO_CODE_LF)
1935 {
1936 src--;
1937 c1 = '\r';
1938 }
1939 }
1940 charset = CHARSET_ASCII;
1941 break;
1942
1943 case ISO_shift_out:
1944 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1945 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1946 goto label_invalid_code;
1947 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1948 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1949 continue;
1950
1951 case ISO_shift_in:
1952 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1953 goto label_invalid_code;
1954 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1955 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1956 continue;
1957
1958 case ISO_single_shift_2_7:
1959 case ISO_single_shift_2:
1960 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1961 goto label_invalid_code;
1962 /* SS2 is handled as an escape sequence of ESC 'N' */
1963 c1 = 'N';
1964 goto label_escape_sequence;
1965
1966 case ISO_single_shift_3:
1967 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1968 goto label_invalid_code;
1969 /* SS2 is handled as an escape sequence of ESC 'O' */
1970 c1 = 'O';
1971 goto label_escape_sequence;
1972
1973 case ISO_control_sequence_introducer:
1974 /* CSI is handled as an escape sequence of ESC '[' ... */
1975 c1 = '[';
1976 goto label_escape_sequence;
1977
1978 case ISO_escape:
1979 ONE_MORE_BYTE (c1);
1980 label_escape_sequence:
1981 /* Escape sequences handled by Emacs are invocation,
1982 designation, direction specification, and character
1983 composition specification. */
1984 switch (c1)
1985 {
1986 case '&': /* revision of following character set */
1987 ONE_MORE_BYTE (c1);
1988 if (!(c1 >= '@' && c1 <= '~'))
1989 goto label_invalid_code;
1990 ONE_MORE_BYTE (c1);
1991 if (c1 != ISO_CODE_ESC)
1992 goto label_invalid_code;
1993 ONE_MORE_BYTE (c1);
1994 goto label_escape_sequence;
1995
1996 case '$': /* designation of 2-byte character set */
1997 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1998 goto label_invalid_code;
1999 ONE_MORE_BYTE (c1);
2000 if (c1 >= '@' && c1 <= 'B')
2001 { /* designation of JISX0208.1978, GB2312.1980,
2002 or JISX0208.1980 */
2003 DECODE_DESIGNATION (0, 2, 94, c1);
2004 }
2005 else if (c1 >= 0x28 && c1 <= 0x2B)
2006 { /* designation of DIMENSION2_CHARS94 character set */
2007 ONE_MORE_BYTE (c2);
2008 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2009 }
2010 else if (c1 >= 0x2C && c1 <= 0x2F)
2011 { /* designation of DIMENSION2_CHARS96 character set */
2012 ONE_MORE_BYTE (c2);
2013 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2014 }
2015 else
2016 goto label_invalid_code;
2017 /* We must update these variables now. */
2018 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2019 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2020 continue;
2021
2022 case 'n': /* invocation of locking-shift-2 */
2023 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2024 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2025 goto label_invalid_code;
2026 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2027 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2028 continue;
2029
2030 case 'o': /* invocation of locking-shift-3 */
2031 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2032 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2033 goto label_invalid_code;
2034 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2035 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2036 continue;
2037
2038 case 'N': /* invocation of single-shift-2 */
2039 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2040 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2041 goto label_invalid_code;
2042 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2043 ONE_MORE_BYTE (c1);
2044 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2045 goto label_invalid_code;
2046 break;
2047
2048 case 'O': /* invocation of single-shift-3 */
2049 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2050 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2051 goto label_invalid_code;
2052 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2053 ONE_MORE_BYTE (c1);
2054 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2055 goto label_invalid_code;
2056 break;
2057
2058 case '0': case '2': case '3': case '4': /* start composition */
2059 DECODE_COMPOSITION_START (c1);
2060 continue;
2061
2062 case '1': /* end composition */
2063 DECODE_COMPOSITION_END (c1);
2064 continue;
2065
2066 case '[': /* specification of direction */
2067 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2068 goto label_invalid_code;
2069 /* For the moment, nested direction is not supported.
2070 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2071 left-to-right, and nonzero means right-to-left. */
2072 ONE_MORE_BYTE (c1);
2073 switch (c1)
2074 {
2075 case ']': /* end of the current direction */
2076 coding->mode &= ~CODING_MODE_DIRECTION;
2077
2078 case '0': /* end of the current direction */
2079 case '1': /* start of left-to-right direction */
2080 ONE_MORE_BYTE (c1);
2081 if (c1 == ']')
2082 coding->mode &= ~CODING_MODE_DIRECTION;
2083 else
2084 goto label_invalid_code;
2085 break;
2086
2087 case '2': /* start of right-to-left direction */
2088 ONE_MORE_BYTE (c1);
2089 if (c1 == ']')
2090 coding->mode |= CODING_MODE_DIRECTION;
2091 else
2092 goto label_invalid_code;
2093 break;
2094
2095 default:
2096 goto label_invalid_code;
2097 }
2098 continue;
2099
2100 case '%':
2101 if (COMPOSING_P (coding))
2102 DECODE_COMPOSITION_END ('1');
2103 ONE_MORE_BYTE (c1);
2104 if (c1 == '/')
2105 {
2106 /* CTEXT extended segment:
2107 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2108 We keep these bytes as is for the moment.
2109 They may be decoded by post-read-conversion. */
2110 int dim, M, L;
2111 int size, required;
2112 int produced_chars;
2113
2114 ONE_MORE_BYTE (dim);
2115 ONE_MORE_BYTE (M);
2116 ONE_MORE_BYTE (L);
2117 size = ((M - 128) * 128) + (L - 128);
2118 required = 8 + size * 2;
2119 if (dst + required > (dst_bytes ? dst_end : src))
2120 goto label_end_of_loop;
2121 *dst++ = ISO_CODE_ESC;
2122 *dst++ = '%';
2123 *dst++ = '/';
2124 *dst++ = dim;
2125 produced_chars = 4;
2126 dst += CHAR_STRING (M, dst), produced_chars++;
2127 dst += CHAR_STRING (L, dst), produced_chars++;
2128 while (size-- > 0)
2129 {
2130 ONE_MORE_BYTE (c1);
2131 dst += CHAR_STRING (c1, dst), produced_chars++;
2132 }
2133 coding->produced_char += produced_chars;
2134 }
2135 else if (c1 == 'G')
2136 {
2137 unsigned char *d = dst;
2138 int produced_chars;
2139
2140 /* XFree86 extension for embedding UTF-8 in CTEXT:
2141 ESC % G --UTF-8-BYTES-- ESC % @
2142 We keep these bytes as is for the moment.
2143 They may be decoded by post-read-conversion. */
2144 if (d + 6 > (dst_bytes ? dst_end : src))
2145 goto label_end_of_loop;
2146 *d++ = ISO_CODE_ESC;
2147 *d++ = '%';
2148 *d++ = 'G';
2149 produced_chars = 3;
2150 while (d + 1 < (dst_bytes ? dst_end : src))
2151 {
2152 ONE_MORE_BYTE (c1);
2153 if (c1 == ISO_CODE_ESC
2154 && src + 1 < src_end
2155 && src[0] == '%'
2156 && src[1] == '@')
2157 {
2158 src += 2;
2159 break;
2160 }
2161 d += CHAR_STRING (c1, d), produced_chars++;
2162 }
2163 if (d + 3 > (dst_bytes ? dst_end : src))
2164 goto label_end_of_loop;
2165 *d++ = ISO_CODE_ESC;
2166 *d++ = '%';
2167 *d++ = '@';
2168 dst = d;
2169 coding->produced_char += produced_chars + 3;
2170 }
2171 else
2172 goto label_invalid_code;
2173 continue;
2174
2175 default:
2176 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2177 goto label_invalid_code;
2178 if (c1 >= 0x28 && c1 <= 0x2B)
2179 { /* designation of DIMENSION1_CHARS94 character set */
2180 ONE_MORE_BYTE (c2);
2181 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2182 }
2183 else if (c1 >= 0x2C && c1 <= 0x2F)
2184 { /* designation of DIMENSION1_CHARS96 character set */
2185 ONE_MORE_BYTE (c2);
2186 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2187 }
2188 else
2189 goto label_invalid_code;
2190 /* We must update these variables now. */
2191 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2192 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2193 continue;
2194 }
2195 }
2196
2197 /* Now we know CHARSET and 1st position code C1 of a character.
2198 Produce a multibyte sequence for that character while getting
2199 2nd position code C2 if necessary. */
2200 if (CHARSET_DIMENSION (charset) == 2)
2201 {
2202 ONE_MORE_BYTE (c2);
2203 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2204 /* C2 is not in a valid range. */
2205 goto label_invalid_code;
2206 }
2207 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2208 EMIT_CHAR (c);
2209 continue;
2210
2211 label_invalid_code:
2212 coding->errors++;
2213 if (COMPOSING_P (coding))
2214 DECODE_COMPOSITION_END ('1');
2215 src = src_base;
2216 c = *src++;
2217 if (! NILP (translation_table))
2218 c = translate_char (translation_table, c, 0, 0, 0);
2219 EMIT_CHAR (c);
2220 }
2221
2222 label_end_of_loop:
2223 coding->consumed = coding->consumed_char = src_base - source;
2224 coding->produced = dst - destination;
2225 return;
2226 }
2227
2228
2229 /* ISO2022 encoding stuff. */
2230
2231 /*
2232 It is not enough to say just "ISO2022" on encoding, we have to
2233 specify more details. In Emacs, each ISO2022 coding system
2234 variant has the following specifications:
2235 1. Initial designation to G0 through G3.
2236 2. Allows short-form designation?
2237 3. ASCII should be designated to G0 before control characters?
2238 4. ASCII should be designated to G0 at end of line?
2239 5. 7-bit environment or 8-bit environment?
2240 6. Use locking-shift?
2241 7. Use Single-shift?
2242 And the following two are only for Japanese:
2243 8. Use ASCII in place of JIS0201-1976-Roman?
2244 9. Use JISX0208-1983 in place of JISX0208-1978?
2245 These specifications are encoded in `coding->flags' as flag bits
2246 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2247 details.
2248 */
2249
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the designation in
   CODING.

   If the revision number recorded for CHARSET is below 255, the
   sequence ESC '&' <'@' + revision> is emitted first.  Then comes
   ESC, a '$' for a two-byte charset, an intermediate character
   selected by REG (one of "()*+" for a 94-character set, one of
   ",-./" for a 96-character set), and the final character of CHARSET.
   The intermediate character is left out (short form) only when
   CHARSET is a two-byte 94-character set, REG is 0, the final
   character is '@', 'A' or 'B', and CODING has
   CODING_FLAG_ISO_SHORT_FORM set.

   DST is taken from the expansion site.  REG indexes a four-character
   string, so it must be in the range 0..3.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    const unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
    const int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset); \
    const int two_byte_set = CHARSET_DIMENSION (charset) != 1; \
    const int set_of_94 = CHARSET_CHARS (charset) == 94; \
    /* Nonzero when the intermediate character may be dropped.  */ \
    const int short_form \
      = (two_byte_set && set_of_94 \
         && ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM) \
         && (reg) == 0 \
         && final_char >= '@' && final_char <= 'B'); \
 \
    if (revision < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + revision; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (two_byte_set) \
      *dst++ = '$'; \
    if (! short_form) \
      *dst++ = (unsigned char) (set_of_94 ? "()*+" : ",-./")[reg]; \
    *dst++ = final_char; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
2292
/* Emit the ISO2022 single-shift-2 / single-shift-3 function at DST
   and mark CODING as being in single-shift state.  When CODING has
   CODING_FLAG_ISO_SEVEN_BITS set the two-byte escape sequence
   (ESC 'N' or ESC 'O') is produced, otherwise the single control
   byte ISO_CODE_SS2 or ISO_CODE_SS3.  Both macros use CODING and DST
   from the expansion site.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) == 0) \
      *dst++ = ISO_CODE_SS2; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) == 0) \
      *dst++ = ISO_CODE_SS3; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
2314
/* Locking-shift functions.  Each macro records in CODING which
   graphic register (0 through 3) is now invoked to graphic plane 0
   and emits the corresponding code at DST: the control byte
   ISO_CODE_SI or ISO_CODE_SO for registers 0 and 1, the escape
   sequences ESC 'n' and ESC 'o' for registers 2 and 3.  CODING and
   DST come from the expansion site.  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
  } while (0)
2342
/* Emit at DST the single byte for a one-byte character of CHARSET
   whose position code is C1.

   If CODING is in single-shift state, the byte is emitted with the
   high bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set
   (otherwise), and the single-shift state is cleared.  Otherwise the
   byte goes out with the high bit cleared when CHARSET is the
   charset of graphic plane 0, or set when it is the charset of
   graphic plane 1.  If CHARSET is on neither plane,
   encode_invocation_designation is called to emit the needed
   designation/invocation codes and the checks are repeated.

   CODING and DST come from the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? ((c1) & 0x7F) : ((c1) | 0x80)); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F; \
        break; \
      } \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80; \
        break; \
      } \
    /* CHARSET is on no graphic plane yet: invoke it (designating it \
       to a register first if necessary), then go round again to \
       emit the character itself.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
2375
/* Emit at DST the two bytes for a two-byte character of CHARSET whose
   position codes are C1 and C2.

   If CODING is in single-shift state, both bytes are emitted with
   the high bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set
   (otherwise), and the single-shift state is cleared.  Otherwise
   both bytes go out with the high bit cleared when CHARSET is the
   charset of graphic plane 0, or set when it is the charset of
   graphic plane 1.  If CHARSET is on neither plane,
   encode_invocation_designation is called to emit the needed
   designation/invocation codes and the checks are repeated.

   CODING and DST come from the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = (c1) & 0x7F; \
            *dst++ = (c2) & 0x7F; \
          } \
        else \
          { \
            *dst++ = (c1) | 0x80; \
            *dst++ = (c2) | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F; \
        *dst++ = (c2) & 0x7F; \
        break; \
      } \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80; \
        *dst++ = (c2) | 0x80; \
        break; \
      } \
    /* CHARSET is on no graphic plane yet: invoke it (designating it \
       to a register first if necessary), then go round again to \
       emit the character itself.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
2408
/* Encode character C at DST.

   C is split into a charset and position codes with SPLIT_CHAR.  If
   that charset is not defined, the position codes are written to DST
   unchanged (the second one only when it is non-negative).
   Otherwise the character is handed to
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to the dimension of its charset, after these
   substitutions requested by the flags of CODING:
     CODING_FLAG_ISO_USE_ROMAN:  CHARSET_ASCII -> charset_latin_jisx0201
     CODING_FLAG_ISO_USE_OLDJIS: charset_jisx0208 -> charset_jisx0208_1978

   CODING and DST come from the expansion site.  */

#define ENCODE_ISO_CHARACTER(c) \
  do { \
    int iso_charset, iso_c1, iso_c2; \
 \
    SPLIT_CHAR (c, iso_charset, iso_c1, iso_c2); \
    if (! CHARSET_DEFINED_P (iso_charset)) \
      { \
        *dst++ = iso_c1; \
        if (iso_c2 >= 0) \
          *dst++ = iso_c2; \
      } \
    else if (CHARSET_DIMENSION (iso_charset) == 1) \
      { \
        if (iso_charset == CHARSET_ASCII \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN)) \
          iso_charset = charset_latin_jisx0201; \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_charset, iso_c1); \
      } \
    else \
      { \
        if (iso_charset == charset_jisx0208 \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS)) \
          iso_charset = charset_jisx0208_1978; \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_charset, iso_c1, iso_c2); \
      } \
  } while (0)
2438
2439
/* Encode CODING_REPLACEMENT_CHARACTER in place of character C: once
   always, and a second time when the width of C's charset is greater
   than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c) \
  do { \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) <= 1) \
      break; \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \
  } while (0)
2448
2449
2450 /* Produce designation and invocation codes at a place pointed by DST
2451 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2452 Return new DST. */
2453
2454 unsigned char *
2455 encode_invocation_designation (charset, coding, dst)
2456 int charset;
2457 struct coding_system *coding;
2458 unsigned char *dst;
2459 {
2460 int reg; /* graphic register number */
2461
2462 /* At first, check designations. */
2463 for (reg = 0; reg < 4; reg++)
2464 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2465 break;
2466
2467 if (reg >= 4)
2468 {
2469 /* CHARSET is not yet designated to any graphic registers. */
2470 /* At first check the requested designation. */
2471 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2472 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2473 /* Since CHARSET requests no special designation, designate it
2474 to graphic register 0. */
2475 reg = 0;
2476
2477 ENCODE_DESIGNATION (charset, reg, coding);
2478 }
2479
2480 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2481 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2482 {
2483 /* Since the graphic register REG is not invoked to any graphic
2484 planes, invoke it to graphic plane 0. */
2485 switch (reg)
2486 {
2487 case 0: /* graphic register 0 */
2488 ENCODE_SHIFT_IN;
2489 break;
2490
2491 case 1: /* graphic register 1 */
2492 ENCODE_SHIFT_OUT;
2493 break;
2494
2495 case 2: /* graphic register 2 */
2496 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2497 ENCODE_SINGLE_SHIFT_2;
2498 else
2499 ENCODE_LOCKING_SHIFT_2;
2500 break;
2501
2502 case 3: /* graphic register 3 */
2503 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2504 ENCODE_SINGLE_SHIFT_3;
2505 else
2506 ENCODE_LOCKING_SHIFT_3;
2507 break;
2508 }
2509 }
2510
2511 return dst;
2512 }
2513
/* Emit at DST the two bytes that encode composition rule RULE.
   COMPOSITION_DECODE_RULE splits RULE into two values; the first is
   written as 32 + 81 + value, the second as 32 + value.  DST comes
   from the expansion site.  */

#define ENCODE_COMPOSITION_RULE(rule) \
  do { \
    int rule_ref_g, rule_ref_n; \
 \
    COMPOSITION_DECODE_RULE (rule, rule_ref_g, rule_ref_n); \
    *dst++ = (32 + 81) + rule_ref_g; \
    *dst++ = 32 + rule_ref_n; \
  } while (0)
2523
/* Emit at DST the code that opens a composition sequence and set up
   CODING for it.  DATA points to an array of integers describing the
   composition (see the comment in coding.h for its format); DATA[3]
   becomes the new value of CODING->composing.

   The code is ESC '0' for COMPOSITION_RELATIVE.  For any other
   method it is ESC '3' (COMPOSITION_WITH_ALTCHARS) or ESC '4', and
   in addition CODING->cmp_data_index is set to
   CODING->cmp_data_start + 4 and CODING->composition_rule_follows is
   cleared.  DST comes from the expansion site.  */

#define ENCODE_COMPOSITION_START(coding, data) \
  do { \
    (coding)->composing = (data)[3]; \
    *dst++ = ISO_CODE_ESC; \
    if ((coding)->composing != COMPOSITION_RELATIVE) \
      { \
        *dst++ = ((coding)->composing != COMPOSITION_WITH_ALTCHARS \
                  ? '4' : '3'); \
        (coding)->cmp_data_index = (coding)->cmp_data_start + 4; \
        (coding)->composition_rule_follows = 0; \
      } \
    else \
      *dst++ = '0'; \
  } while (0)
2543
/* Emit ESC '1' at DST to close the current composition, advance
   CODING->cmp_data_start by DATA[0], and set CODING->composing to
   COMPOSITION_NO.  If that advance reaches the `used' count of the
   current composition data block and the block has a successor,
   switch CODING to the successor and restart at offset 0.  DST comes
   from the expansion site.  */

#define ENCODE_COMPOSITION_END(coding, data) \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = '1'; \
    (coding)->cmp_data_start += (data)[0]; \
    (coding)->composing = COMPOSITION_NO; \
    if ((coding)->cmp_data_start != (coding)->cmp_data->used) \
      break; \
    if (! (coding)->cmp_data->next) \
      break; \
    (coding)->cmp_data = (coding)->cmp_data->next; \
    (coding)->cmp_data_start = 0; \
  } while (0)
2559
/* Emit the composition start sequence ESC '0' at DST and put CODING
   into COMPOSITION_RELATIVE state.  The sequence does not open a new
   composition here: it marks that the components (alternate chars
   and composition rules) of the composition have just been produced
   and that the actual text follows in SRC.  DST comes from the
   expansion site.  */

#define ENCODE_COMPOSITION_FAKE_START(coding) \
  do { \
    (coding)->composing = COMPOSITION_RELATIVE; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = '0'; \
  } while (0)
2571
/* The following three macros produce codes for indicating direction
   of text.  All of them use CODING and DST from the expansion site
   and expand to a single statement.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer: the two bytes ESC '[' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  `coding->flags' is a set of flag bits, so the flag is
   tested with `&' as in the other ENCODE_XXX macros; comparing the
   whole word with `==' would pick the 8-bit form as soon as any
   other flag is set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit the introducer
   followed by "2]" and "0]" respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '['; \
      } \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
2587
/* Emit at DST the codes that bring the graphic planes and registers
   back to their initial state: a shift-in if graphic plane 0 does
   not currently have register 0 invoked, then, for each of the four
   registers whose initial designation is non-negative and differs
   from the current one, a designation of the initial charset.
   CODING and DST come from the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg; \
 \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    for (reg = 0; reg < 4; reg++) \
      { \
        int initial_charset \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg); \
 \
        if (initial_charset < 0 \
            || CODING_SPEC_ISO_DESIGNATION (coding, reg) == initial_charset) \
          continue; \
        ENCODE_DESIGNATION (initial_charset, reg, coding); \
      } \
  } while (0)
2602
2603 /* Produce designation sequences of charsets in the line started from
2604 SRC to a place pointed by DST, and return updated DST.
2605
2606 If the current block ends before any end-of-line, we may fail to
2607 find all the necessary designations. */
2608
2609 static unsigned char *
2610 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2611 struct coding_system *coding;
2612 Lisp_Object translation_table;
2613 const unsigned char *src, *src_end;
2614 unsigned char *dst;
2615 {
2616 int charset, c, found = 0, reg;
2617 /* Table of charsets to be designated to each graphic register. */
2618 int r[4];
2619
2620 for (reg = 0; reg < 4; reg++)
2621 r[reg] = -1;
2622
2623 while (found < 4)
2624 {
2625 ONE_MORE_CHAR (c);
2626 if (c == '\n')
2627 break;
2628
2629 charset = CHAR_CHARSET (c);
2630 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2631 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2632 {
2633 found++;
2634 r[reg] = charset;
2635 }
2636 }
2637
2638 label_end_of_loop:
2639 if (found)
2640 {
2641 for (reg = 0; reg < 4; reg++)
2642 if (r[reg] >= 0
2643 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2644 ENCODE_DESIGNATION (r[reg], reg, coding);
2645 }
2646
2647 return dst;
2648 }
2649
2650 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2651
2652 static void
2653 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2654 struct coding_system *coding;
2655 const unsigned char *source;
2656 unsigned char *destination;
2657 int src_bytes, dst_bytes;
2658 {
2659 const unsigned char *src = source;
2660 const unsigned char *src_end = source + src_bytes;
2661 unsigned char *dst = destination;
2662 unsigned char *dst_end = destination + dst_bytes;
2663 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2664 from DST_END to assure overflow checking is necessary only at the
2665 head of loop. */
2666 unsigned char *adjusted_dst_end = dst_end - 19;
2667 /* SRC_BASE remembers the start position in source in each loop.
2668 The loop will be exited when there's not enough source text to
2669 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2670 there's not enough destination area to produce encoded codes
2671 (within macro EMIT_BYTES). */
2672 const unsigned char *src_base;
2673 int c;
2674 Lisp_Object translation_table;
2675 Lisp_Object safe_chars;
2676
2677 if (coding->flags & CODING_FLAG_ISO_SAFE)
2678 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2679
2680 safe_chars = coding_safe_chars (coding->symbol);
2681
2682 if (NILP (Venable_character_translation))
2683 translation_table = Qnil;
2684 else
2685 {
2686 translation_table = coding->translation_table_for_encode;
2687 if (NILP (translation_table))
2688 translation_table = Vstandard_translation_table_for_encode;
2689 }
2690
2691 coding->consumed_char = 0;
2692 coding->errors = 0;
2693 while (1)
2694 {
2695 src_base = src;
2696
2697 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2698 {
2699 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2700 break;
2701 }
2702
2703 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2704 && CODING_SPEC_ISO_BOL (coding))
2705 {
2706 /* We have to produce designation sequences if any now. */
2707 dst = encode_designation_at_bol (coding, translation_table,
2708 src, src_end, dst);
2709 CODING_SPEC_ISO_BOL (coding) = 0;
2710 }
2711
2712 /* Check composition start and end. */
2713 if (coding->composing != COMPOSITION_DISABLED
2714 && coding->cmp_data_start < coding->cmp_data->used)
2715 {
2716 struct composition_data *cmp_data = coding->cmp_data;
2717 int *data = cmp_data->data + coding->cmp_data_start;
2718 int this_pos = cmp_data->char_offset + coding->consumed_char;
2719
2720 if (coding->composing == COMPOSITION_RELATIVE)
2721 {
2722 if (this_pos == data[2])
2723 {
2724 ENCODE_COMPOSITION_END (coding, data);
2725 cmp_data = coding->cmp_data;
2726 data = cmp_data->data + coding->cmp_data_start;
2727 }
2728 }
2729 else if (COMPOSING_P (coding))
2730 {
2731 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2732 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2733 /* We have consumed components of the composition.
2734 What follows in SRC is the composition's base
2735 text. */
2736 ENCODE_COMPOSITION_FAKE_START (coding);
2737 else
2738 {
2739 int c = cmp_data->data[coding->cmp_data_index++];
2740 if (coding->composition_rule_follows)
2741 {
2742 ENCODE_COMPOSITION_RULE (c);
2743 coding->composition_rule_follows = 0;
2744 }
2745 else
2746 {
2747 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2748 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2749 ENCODE_UNSAFE_CHARACTER (c);
2750 else
2751 ENCODE_ISO_CHARACTER (c);
2752 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2753 coding->composition_rule_follows = 1;
2754 }
2755 continue;
2756 }
2757 }
2758 if (!COMPOSING_P (coding))
2759 {
2760 if (this_pos == data[1])
2761 {
2762 ENCODE_COMPOSITION_START (coding, data);
2763 continue;
2764 }
2765 }
2766 }
2767
2768 ONE_MORE_CHAR (c);
2769
2770 /* Now encode the character C. */
2771 if (c < 0x20 || c == 0x7F)
2772 {
2773 if (c == '\r')
2774 {
2775 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2776 {
2777 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2778 ENCODE_RESET_PLANE_AND_REGISTER;
2779 *dst++ = c;
2780 continue;
2781 }
2782 /* fall down to treat '\r' as '\n' ... */
2783 c = '\n';
2784 }
2785 if (c == '\n')
2786 {
2787 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2788 ENCODE_RESET_PLANE_AND_REGISTER;
2789 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2790 bcopy (coding->spec.iso2022.initial_designation,
2791 coding->spec.iso2022.current_designation,
2792 sizeof coding->spec.iso2022.initial_designation);
2793 if (coding->eol_type == CODING_EOL_LF
2794 || coding->eol_type == CODING_EOL_UNDECIDED)
2795 *dst++ = ISO_CODE_LF;
2796 else if (coding->eol_type == CODING_EOL_CRLF)
2797 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2798 else
2799 *dst++ = ISO_CODE_CR;
2800 CODING_SPEC_ISO_BOL (coding) = 1;
2801 }
2802 else
2803 {
2804 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2805 ENCODE_RESET_PLANE_AND_REGISTER;
2806 *dst++ = c;
2807 }
2808 }
2809 else if (ASCII_BYTE_P (c))
2810 ENCODE_ISO_CHARACTER (c);
2811 else if (SINGLE_BYTE_CHAR_P (c))
2812 {
2813 *dst++ = c;
2814 coding->errors++;
2815 }
2816 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2817 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2818 ENCODE_UNSAFE_CHARACTER (c);
2819 else
2820 ENCODE_ISO_CHARACTER (c);
2821
2822 coding->consumed_char++;
2823 }
2824
2825 label_end_of_loop:
2826 coding->consumed = src_base - source;
2827 coding->produced = coding->produced_char = dst - destination;
2828 }
2829
2830 \f
2831 /*** 4. SJIS and BIG5 handlers ***/
2832
2833 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2834 quite widely. So, for the moment, Emacs supports them in the bare
2835 C code. But, in the future, they may be supported only by CCL. */
2836
2837 /* SJIS is a coding system encoding three character sets: ASCII, right
2838 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2839 as is. A character of charset katakana-jisx0201 is encoded by
2840 "position-code + 0x80". A character of charset japanese-jisx0208
2841 is encoded in 2-byte but two position-codes are divided and shifted
2842 so that it fits in the range below.
2843
2844 --- CODE RANGE of SJIS ---
2845 (character set) (range)
2846 ASCII 0x00 .. 0x7F
2847 KATAKANA-JISX0201 0xA1 .. 0xDF
2848 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2849 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2850 -------------------------------
2851
2852 */
2853
2854 /* BIG5 is a coding system encoding two character sets: ASCII and
2855 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2856 character set and is encoded in two bytes.
2857
2858 --- CODE RANGE of BIG5 ---
2859 (character set) (range)
2860 ASCII 0x00 .. 0x7F
2861 Big5 (1st byte) 0xA1 .. 0xFE
2862 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2863 --------------------------
2864
2865 Since the number of characters in Big5 is larger than maximum
2866 characters in Emacs' charset (96x96), it can't be handled as one
2867 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2868 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2869 contains frequently used characters and the latter contains less
2870 frequently used characters. */
2871
/* Macros converting between the two bytes B1, B2 of a character in
   the BIG5 coding system and Emacs' internal representation, which
   is a charset CHARSET (`charset_big5_1' or `charset_big5_2')
   together with the 1st and 2nd position-codes C1 and C2.

   Both directions go through a linear index: BIG5 bytes are
   numbered row by row with BIG5_SAME_ROW codes per first byte, and
   the index is split into C1/C2 with (0xFF - 0xA1) codes per row
   starting at 0x21.  First bytes from 0xC9 upward belong to
   `charset_big5_2', whose index restarts at zero.  */

/* Number of Big5 characters which have the same code in 1st byte:
   the second byte ranges 0xA1..0xFE and 0x40..0x7E together.  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))

#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int big5_index \
      = (((b1) - 0xA1) * BIG5_SAME_ROW \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62)); \
 \
    if ((b1) >= 0xC9) \
      { \
        charset = charset_big5_2; \
        big5_index -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    else \
      charset = charset_big5_1; \
    c1 = big5_index / (0xFF - 0xA1) + 0x21; \
    c2 = big5_index % (0xFF - 0xA1) + 0x21; \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int big5_index \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
 \
    if ((charset) == charset_big5_2) \
      big5_index += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    b1 = big5_index / BIG5_SAME_ROW + 0xA1; \
    b2 = big5_index % BIG5_SAME_ROW; \
    /* The first 0x3F codes of a row live at 0x40.., the rest at \
       0xA1...  */ \
    b2 += (b2 < 0x3F) ? 0x40 : 0x62; \
  } while (0)
2904
2905 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2906 Check if a text is encoded in SJIS. If it is, return
2907 CODING_CATEGORY_MASK_SJIS, else return 0. */
2908
2909 static int
2910 detect_coding_sjis (src, src_end, multibytep)
2911 unsigned char *src, *src_end;
2912 int multibytep;
2913 {
2914 int c;
2915 /* Dummy for ONE_MORE_BYTE. */
2916 struct coding_system dummy_coding;
2917 struct coding_system *coding = &dummy_coding;
2918
2919 while (1)
2920 {
2921 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2922 if (c < 0x80)
2923 continue;
2924 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2925 return 0;
2926 if (c <= 0x9F || c >= 0xE0)
2927 {
2928 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2929 if (c < 0x40 || c == 0x7F || c > 0xFC)
2930 return 0;
2931 }
2932 }
2933 }
2934
2935 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2936 Check if a text is encoded in BIG5. If it is, return
2937 CODING_CATEGORY_MASK_BIG5, else return 0. */
2938
2939 static int
2940 detect_coding_big5 (src, src_end, multibytep)
2941 unsigned char *src, *src_end;
2942 int multibytep;
2943 {
2944 int c;
2945 /* Dummy for ONE_MORE_BYTE. */
2946 struct coding_system dummy_coding;
2947 struct coding_system *coding = &dummy_coding;
2948
2949 while (1)
2950 {
2951 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2952 if (c < 0x80)
2953 continue;
2954 if (c < 0xA1 || c > 0xFE)
2955 return 0;
2956 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2957 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2958 return 0;
2959 }
2960 }
2961
2962 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2963 Check if a text is encoded in UTF-8. If it is, return
2964 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2965
/* Predicates classifying one octet C of a UTF-8 sequence by its
   high-order bits.  */

/* 0xxxxxxx: a character encoded in a single octet.  */
#define UTF_8_1_OCTET_P(c)         (0x80 > (c))
/* 10xxxxxx: a trailing (continuation) octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == ((c) & 0xC0))
/* 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))
/* 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))
/* 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))
/* 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))
/* 1111110x: leading octet of a 6-octet sequence.  */
#define UTF_8_6_OCTET_LEADING_P(c) (0xFC == ((c) & 0xFE))
2973
2974 static int
2975 detect_coding_utf_8 (src, src_end, multibytep)
2976 unsigned char *src, *src_end;
2977 int multibytep;
2978 {
2979 unsigned char c;
2980 int seq_maybe_bytes;
2981 /* Dummy for ONE_MORE_BYTE. */
2982 struct coding_system dummy_coding;
2983 struct coding_system *coding = &dummy_coding;
2984
2985 while (1)
2986 {
2987 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2988 if (UTF_8_1_OCTET_P (c))
2989 continue;
2990 else if (UTF_8_2_OCTET_LEADING_P (c))
2991 seq_maybe_bytes = 1;
2992 else if (UTF_8_3_OCTET_LEADING_P (c))
2993 seq_maybe_bytes = 2;
2994 else if (UTF_8_4_OCTET_LEADING_P (c))
2995 seq_maybe_bytes = 3;
2996 else if (UTF_8_5_OCTET_LEADING_P (c))
2997 seq_maybe_bytes = 4;
2998 else if (UTF_8_6_OCTET_LEADING_P (c))
2999 seq_maybe_bytes = 5;
3000 else
3001 return 0;
3002
3003 do
3004 {
3005 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3006 if (!UTF_8_EXTRA_OCTET_P (c))
3007 return 0;
3008 seq_maybe_bytes--;
3009 }
3010 while (seq_maybe_bytes > 0);
3011 }
3012 }
3013
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text starts with a UTF-16 byte order mark.  If the
   first two bytes are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
3019
/* Nonzero if VAL is one of the two 16-bit values 0xFFFE and 0xFFFF.  */
#define UTF_16_INVALID_P(val) \
  ((0xFFFE == (val)) || (0xFFFF == (val)))
3023
/* Nonzero if VAL is a UTF-16 high (leading) surrogate, i.e. lies in
   0xD800..0xDBFF.  The top six bits must be exactly 110110, so the
   mask is 0xFC00; masking with 0xD800 would also accept low
   surrogates and values such as 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)
3026
/* Nonzero if VAL is a UTF-16 low (trailing) surrogate, i.e. lies in
   0xDC00..0xDFFF.  The top six bits must be exactly 110111, so the
   mask is 0xFC00; masking with 0xDC00 would also accept
   0xFC00..0xFFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3029
3030 static int
3031 detect_coding_utf_16 (src, src_end, multibytep)
3032 unsigned char *src, *src_end;
3033 int multibytep;
3034 {
3035 unsigned char c1, c2;
3036 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3037 struct coding_system dummy_coding;
3038 struct coding_system *coding = &dummy_coding;
3039
3040 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3041 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3042
3043 if ((c1 == 0xFF) && (c2 == 0xFE))
3044 return CODING_CATEGORY_MASK_UTF_16_LE;
3045 else if ((c1 == 0xFE) && (c2 == 0xFF))
3046 return CODING_CATEGORY_MASK_UTF_16_BE;
3047 return 0;
3048 }
3049
3050 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3051 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3052
3053 static void
3054 decode_coding_sjis_big5 (coding, source, destination,
3055 src_bytes, dst_bytes, sjis_p)
3056 struct coding_system *coding;
3057 const unsigned char *source;
3058 unsigned char *destination;
3059 int src_bytes, dst_bytes;
3060 int sjis_p;
3061 {
3062 const unsigned char *src = source;
3063 const unsigned char *src_end = source + src_bytes;
3064 unsigned char *dst = destination;
3065 unsigned char *dst_end = destination + dst_bytes;
3066 /* SRC_BASE remembers the start position in source in each loop.
3067 The loop will be exited when there's not enough source code
3068 (within macro ONE_MORE_BYTE), or when there's not enough
3069 destination area to produce a character (within macro
3070 EMIT_CHAR). */
3071 const unsigned char *src_base;
3072 Lisp_Object translation_table;
3073
3074 if (NILP (Venable_character_translation))
3075 translation_table = Qnil;
3076 else
3077 {
3078 translation_table = coding->translation_table_for_decode;
3079 if (NILP (translation_table))
3080 translation_table = Vstandard_translation_table_for_decode;
3081 }
3082
3083 coding->produced_char = 0;
3084 while (1)
3085 {
3086 int c, charset, c1, c2 = 0;
3087
3088 src_base = src;
3089 ONE_MORE_BYTE (c1);
3090
3091 if (c1 < 0x80)
3092 {
3093 charset = CHARSET_ASCII;
3094 if (c1 < 0x20)
3095 {
3096 if (c1 == '\r')
3097 {
3098 if (coding->eol_type == CODING_EOL_CRLF)
3099 {
3100 ONE_MORE_BYTE (c2);
3101 if (c2 == '\n')
3102 c1 = c2;
3103 else
3104 /* To process C2 again, SRC is subtracted by 1. */
3105 src--;
3106 }
3107 else if (coding->eol_type == CODING_EOL_CR)
3108 c1 = '\n';
3109 }
3110 else if (c1 == '\n'
3111 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3112 && (coding->eol_type == CODING_EOL_CR
3113 || coding->eol_type == CODING_EOL_CRLF))
3114 {
3115 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3116 goto label_end_of_loop;
3117 }
3118 }
3119 }
3120 else
3121 {
3122 if (sjis_p)
3123 {
3124 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3125 goto label_invalid_code;
3126 if (c1 <= 0x9F || c1 >= 0xE0)
3127 {
3128 /* SJIS -> JISX0208 */
3129 ONE_MORE_BYTE (c2);
3130 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3131 goto label_invalid_code;
3132 DECODE_SJIS (c1, c2, c1, c2);
3133 charset = charset_jisx0208;
3134 }
3135 else
3136 /* SJIS -> JISX0201-Kana */
3137 charset = charset_katakana_jisx0201;
3138 }
3139 else
3140 {
3141 /* BIG5 -> Big5 */
3142 if (c1 < 0xA0 || c1 > 0xFE)
3143 goto label_invalid_code;
3144 ONE_MORE_BYTE (c2);
3145 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3146 goto label_invalid_code;
3147 DECODE_BIG5 (c1, c2, charset, c1, c2);
3148 }
3149 }
3150
3151 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3152 EMIT_CHAR (c);
3153 continue;
3154
3155 label_invalid_code:
3156 coding->errors++;
3157 src = src_base;
3158 c = *src++;
3159 EMIT_CHAR (c);
3160 }
3161
3162 label_end_of_loop:
3163 coding->consumed = coding->consumed_char = src_base - source;
3164 coding->produced = dst - destination;
3165 return;
3166 }
3167
3168 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3169 This function can encode charsets `ascii', `katakana-jisx0201',
3170 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3171 are sure that all these charsets are registered as official charset
3172 (i.e. do not have extended leading-codes). Characters of other
3173 charsets are produced without any encoding. If SJIS_P is 1, encode
3174 SJIS text, else encode BIG5 text. */
3175
3176 static void
3177 encode_coding_sjis_big5 (coding, source, destination,
3178 src_bytes, dst_bytes, sjis_p)
3179 struct coding_system *coding;
3180 unsigned char *source, *destination;
3181 int src_bytes, dst_bytes;
3182 int sjis_p;
3183 {
3184 unsigned char *src = source;
3185 unsigned char *src_end = source + src_bytes;
3186 unsigned char *dst = destination;
3187 unsigned char *dst_end = destination + dst_bytes;
3188 /* SRC_BASE remembers the start position in source in each loop.
3189 The loop will be exited when there's not enough source text to
3190 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3191 there's not enough destination area to produce encoded codes
3192 (within macro EMIT_BYTES). */
3193 unsigned char *src_base;
3194 Lisp_Object translation_table;
3195
3196 if (NILP (Venable_character_translation))
3197 translation_table = Qnil;
3198 else
3199 {
3200 translation_table = coding->translation_table_for_encode;
3201 if (NILP (translation_table))
3202 translation_table = Vstandard_translation_table_for_encode;
3203 }
3204
3205 while (1)
3206 {
3207 int c, charset, c1, c2;
3208
3209 src_base = src;
3210 ONE_MORE_CHAR (c);
3211
3212 /* Now encode the character C. */
3213 if (SINGLE_BYTE_CHAR_P (c))
3214 {
3215 switch (c)
3216 {
3217 case '\r':
3218 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3219 {
3220 EMIT_ONE_BYTE (c);
3221 break;
3222 }
3223 c = '\n';
3224 case '\n':
3225 if (coding->eol_type == CODING_EOL_CRLF)
3226 {
3227 EMIT_TWO_BYTES ('\r', c);
3228 break;
3229 }
3230 else if (coding->eol_type == CODING_EOL_CR)
3231 c = '\r';
3232 default:
3233 EMIT_ONE_BYTE (c);
3234 }
3235 }
3236 else
3237 {
3238 SPLIT_CHAR (c, charset, c1, c2);
3239 if (sjis_p)
3240 {
3241 if (charset == charset_jisx0208
3242 || charset == charset_jisx0208_1978)
3243 {
3244 ENCODE_SJIS (c1, c2, c1, c2);
3245 EMIT_TWO_BYTES (c1, c2);
3246 }
3247 else if (charset == charset_katakana_jisx0201)
3248 EMIT_ONE_BYTE (c1 | 0x80);
3249 else if (charset == charset_latin_jisx0201)
3250 EMIT_ONE_BYTE (c1);
3251 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3252 {
3253 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3254 if (CHARSET_WIDTH (charset) > 1)
3255 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3256 }
3257 else
3258 /* There's no way other than producing the internal
3259 codes as is. */
3260 EMIT_BYTES (src_base, src);
3261 }
3262 else
3263 {
3264 if (charset == charset_big5_1 || charset == charset_big5_2)
3265 {
3266 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3267 EMIT_TWO_BYTES (c1, c2);
3268 }
3269 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3270 {
3271 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3272 if (CHARSET_WIDTH (charset) > 1)
3273 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3274 }
3275 else
3276 /* There's no way other than producing the internal
3277 codes as is. */
3278 EMIT_BYTES (src_base, src);
3279 }
3280 }
3281 coding->consumed_char++;
3282 }
3283
3284 label_end_of_loop:
3285 coding->consumed = src_base - source;
3286 coding->produced = coding->produced_char = dst - destination;
3287 }
3288
3289 \f
3290 /*** 5. CCL handlers ***/
3291
3292 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3293 Check if a text is encoded in a coding system of which
3294 encoder/decoder are written in CCL program. If it is, return
3295 CODING_CATEGORY_MASK_CCL, else return 0. */
3296
3297 static int
3298 detect_coding_ccl (src, src_end, multibytep)
3299 unsigned char *src, *src_end;
3300 int multibytep;
3301 {
3302 unsigned char *valid;
3303 int c;
3304 /* Dummy for ONE_MORE_BYTE. */
3305 struct coding_system dummy_coding;
3306 struct coding_system *coding = &dummy_coding;
3307
3308 /* No coding system is assigned to coding-category-ccl. */
3309 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3310 return 0;
3311
3312 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3313 while (1)
3314 {
3315 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3316 if (! valid[c])
3317 return 0;
3318 }
3319 }
3320
3321 \f
3322 /*** 6. End-of-line handlers ***/
3323
3324 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3325
3326 static void
3327 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3328 struct coding_system *coding;
3329 const unsigned char *source;
3330 unsigned char *destination;
3331 int src_bytes, dst_bytes;
3332 {
3333 const unsigned char *src = source;
3334 unsigned char *dst = destination;
3335 const unsigned char *src_end = src + src_bytes;
3336 unsigned char *dst_end = dst + dst_bytes;
3337 Lisp_Object translation_table;
3338 /* SRC_BASE remembers the start position in source in each loop.
3339 The loop will be exited when there's not enough source code
3340 (within macro ONE_MORE_BYTE), or when there's not enough
3341 destination area to produce a character (within macro
3342 EMIT_CHAR). */
3343 const unsigned char *src_base;
3344 int c;
3345
3346 translation_table = Qnil;
3347 switch (coding->eol_type)
3348 {
3349 case CODING_EOL_CRLF:
3350 while (1)
3351 {
3352 src_base = src;
3353 ONE_MORE_BYTE (c);
3354 if (c == '\r')
3355 {
3356 ONE_MORE_BYTE (c);
3357 if (c != '\n')
3358 {
3359 src--;
3360 c = '\r';
3361 }
3362 }
3363 else if (c == '\n'
3364 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3365 {
3366 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3367 goto label_end_of_loop;
3368 }
3369 EMIT_CHAR (c);
3370 }
3371 break;
3372
3373 case CODING_EOL_CR:
3374 while (1)
3375 {
3376 src_base = src;
3377 ONE_MORE_BYTE (c);
3378 if (c == '\n')
3379 {
3380 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3381 {
3382 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3383 goto label_end_of_loop;
3384 }
3385 }
3386 else if (c == '\r')
3387 c = '\n';
3388 EMIT_CHAR (c);
3389 }
3390 break;
3391
3392 default: /* no need for EOL handling */
3393 while (1)
3394 {
3395 src_base = src;
3396 ONE_MORE_BYTE (c);
3397 EMIT_CHAR (c);
3398 }
3399 }
3400
3401 label_end_of_loop:
3402 coding->consumed = coding->consumed_char = src_base - source;
3403 coding->produced = dst - destination;
3404 return;
3405 }
3406
3407 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3408 format of end-of-line according to `coding->eol_type'. It also
3409 convert multibyte form 8-bit characters to unibyte if
3410 CODING->src_multibyte is nonzero. If `coding->mode &
3411 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3412 also means end-of-line. */
3413
3414 static void
3415 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3416 struct coding_system *coding;
3417 const unsigned char *source;
3418 unsigned char *destination;
3419 int src_bytes, dst_bytes;
3420 {
3421 const unsigned char *src = source;
3422 unsigned char *dst = destination;
3423 const unsigned char *src_end = src + src_bytes;
3424 unsigned char *dst_end = dst + dst_bytes;
3425 Lisp_Object translation_table;
3426 /* SRC_BASE remembers the start position in source in each loop.
3427 The loop will be exited when there's not enough source text to
3428 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3429 there's not enough destination area to produce encoded codes
3430 (within macro EMIT_BYTES). */
3431 const unsigned char *src_base;
3432 unsigned char *tmp;
3433 int c;
3434 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3435
3436 translation_table = Qnil;
3437 if (coding->src_multibyte
3438 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3439 {
3440 src_end--;
3441 src_bytes--;
3442 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3443 }
3444
3445 if (coding->eol_type == CODING_EOL_CRLF)
3446 {
3447 while (src < src_end)
3448 {
3449 src_base = src;
3450 c = *src++;
3451 if (c >= 0x20)
3452 EMIT_ONE_BYTE (c);
3453 else if (c == '\n' || (c == '\r' && selective_display))
3454 EMIT_TWO_BYTES ('\r', '\n');
3455 else
3456 EMIT_ONE_BYTE (c);
3457 }
3458 src_base = src;
3459 label_end_of_loop:
3460 ;
3461 }
3462 else
3463 {
3464 if (!dst_bytes || src_bytes <= dst_bytes)
3465 {
3466 safe_bcopy (src, dst, src_bytes);
3467 src_base = src_end;
3468 dst += src_bytes;
3469 }
3470 else
3471 {
3472 if (coding->src_multibyte
3473 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3474 dst_bytes--;
3475 safe_bcopy (src, dst, dst_bytes);
3476 src_base = src + dst_bytes;
3477 dst = destination + dst_bytes;
3478 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3479 }
3480 if (coding->eol_type == CODING_EOL_CR)
3481 {
3482 for (tmp = destination; tmp < dst; tmp++)
3483 if (*tmp == '\n') *tmp = '\r';
3484 }
3485 else if (selective_display)
3486 {
3487 for (tmp = destination; tmp < dst; tmp++)
3488 if (*tmp == '\r') *tmp = '\n';
3489 }
3490 }
3491 if (coding->src_multibyte)
3492 dst = destination + str_as_unibyte (destination, dst - destination);
3493
3494 coding->consumed = src_base - source;
3495 coding->produced = dst - destination;
3496 coding->produced_char = coding->produced;
3497 }
3498
3499 \f
3500 /*** 7. C library functions ***/
3501
3502 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3503 has a property `coding-system'. The value of this property is a
3504 vector of length 5 (called the coding-vector). Among elements of
3505 this vector, the first (element[0]) and the fifth (element[4])
3506 carry important information for decoding/encoding. Before
3507 decoding/encoding, this information should be set in fields of a
3508 structure of type `coding_system'.
3509
3510 The value of the property `coding-system' can be a symbol of another
3511 subsidiary coding-system. In that case, Emacs gets coding-vector
3512 from that symbol.
3513
3514 `element[0]' contains information to be set in `coding->type'. The
3515 value and its meaning is as follows:
3516
   0 -- coding_type_emacs_mule
   1 -- coding_type_sjis
   2 -- coding_type_iso2022
   3 -- coding_type_big5
   4 -- coding_type_ccl	encoder/decoder written in CCL
   5 -- coding_type_raw_text
   nil -- coding_type_no_conversion
   t -- coding_type_undecided (automatic conversion on decoding,
	no-conversion on encoding)
3525
3526 `element[4]' contains information to be set in `coding->flags' and
3527 `coding->spec'. The meaning varies by `coding->type'.
3528
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
   Meanings of these sub-elements are:
3532
3533 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3534 If the value is an integer of valid charset, the charset is
3535 assumed to be designated to graphic register N initially.
3536
3537 If the value is minus, it is a minus value of charset which
3538 reserves graphic register N, which means that the charset is
3539 not designated initially but should be designated to graphic
3540 register N just before encoding a character in that charset.
3541
3542 If the value is nil, graphic register N is never used on
3543 encoding.
3544
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
	Each value takes t or nil.  See the section ISO2022 of
	`coding.h' for more information.
3548
3549 If `coding->type' is `coding_type_big5', element[4] is t to denote
3550 BIG5-ETen or nil to denote BIG5-HKU.
3551
3552 If `coding->type' takes the other value, element[4] is ignored.
3553
3554 Emacs Lisp's coding systems also carry information about format of
3555 end-of-line in a value of property `eol-type'. If the value is
3556 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3557 means CODING_EOL_CR. If it is not integer, it should be a vector
3558 of subsidiary coding systems of which property `eol-type' has one
3559 of the above values.
3560
3561 */
3562
3563 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3564 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3565 is setup so that no conversion is necessary and return -1, else
3566 return 0. */
3567
3568 int
3569 setup_coding_system (coding_system, coding)
3570 Lisp_Object coding_system;
3571 struct coding_system *coding;
3572 {
3573 Lisp_Object coding_spec, coding_type, eol_type, plist;
3574 Lisp_Object val;
3575
3576 /* At first, zero clear all members. */
3577 bzero (coding, sizeof (struct coding_system));
3578
3579 /* Initialize some fields required for all kinds of coding systems. */
3580 coding->symbol = coding_system;
3581 coding->heading_ascii = -1;
3582 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3583 coding->composing = COMPOSITION_DISABLED;
3584 coding->cmp_data = NULL;
3585
3586 if (NILP (coding_system))
3587 goto label_invalid_coding_system;
3588
3589 coding_spec = Fget (coding_system, Qcoding_system);
3590
3591 if (!VECTORP (coding_spec)
3592 || XVECTOR (coding_spec)->size != 5
3593 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3594 goto label_invalid_coding_system;
3595
3596 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3597 if (VECTORP (eol_type))
3598 {
3599 coding->eol_type = CODING_EOL_UNDECIDED;
3600 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3601 if (system_eol_type != CODING_EOL_LF)
3602 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3603 }
3604 else if (XFASTINT (eol_type) == 1)
3605 {
3606 coding->eol_type = CODING_EOL_CRLF;
3607 coding->common_flags
3608 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3609 }
3610 else if (XFASTINT (eol_type) == 2)
3611 {
3612 coding->eol_type = CODING_EOL_CR;
3613 coding->common_flags
3614 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3615 }
3616 else
3617 {
3618 coding->common_flags = 0;
3619 coding->eol_type = CODING_EOL_LF;
3620 }
3621
3622 coding_type = XVECTOR (coding_spec)->contents[0];
3623 /* Try short cut. */
3624 if (SYMBOLP (coding_type))
3625 {
3626 if (EQ (coding_type, Qt))
3627 {
3628 coding->type = coding_type_undecided;
3629 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3630 }
3631 else
3632 coding->type = coding_type_no_conversion;
3633 /* Initialize this member. Any thing other than
3634 CODING_CATEGORY_IDX_UTF_16_BE and
3635 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3636 special treatment in detect_eol. */
3637 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3638
3639 return 0;
3640 }
3641
3642 /* Get values of coding system properties:
3643 `post-read-conversion', `pre-write-conversion',
3644 `translation-table-for-decode', `translation-table-for-encode'. */
3645 plist = XVECTOR (coding_spec)->contents[3];
3646 /* Pre & post conversion functions should be disabled if
3647 inhibit_eol_conversion is nonzero. This is the case that a code
3648 conversion function is called while those functions are running. */
3649 if (! inhibit_pre_post_conversion)
3650 {
3651 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3652 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3653 }
3654 val = Fplist_get (plist, Qtranslation_table_for_decode);
3655 if (SYMBOLP (val))
3656 val = Fget (val, Qtranslation_table_for_decode);
3657 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3658 val = Fplist_get (plist, Qtranslation_table_for_encode);
3659 if (SYMBOLP (val))
3660 val = Fget (val, Qtranslation_table_for_encode);
3661 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3662 val = Fplist_get (plist, Qcoding_category);
3663 if (!NILP (val))
3664 {
3665 val = Fget (val, Qcoding_category_index);
3666 if (INTEGERP (val))
3667 coding->category_idx = XINT (val);
3668 else
3669 goto label_invalid_coding_system;
3670 }
3671 else
3672 goto label_invalid_coding_system;
3673
3674 /* If the coding system has non-nil `composition' property, enable
3675 composition handling. */
3676 val = Fplist_get (plist, Qcomposition);
3677 if (!NILP (val))
3678 coding->composing = COMPOSITION_NO;
3679
3680 /* If the coding system is ascii-incompatible, record it in
3681 common_flags. */
3682 val = Fplist_get (plist, Qascii_incompatible);
3683 if (! NILP (val))
3684 coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3685
3686 switch (XFASTINT (coding_type))
3687 {
3688 case 0:
3689 coding->type = coding_type_emacs_mule;
3690 coding->common_flags
3691 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3692 if (!NILP (coding->post_read_conversion))
3693 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3694 if (!NILP (coding->pre_write_conversion))
3695 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3696 break;
3697
3698 case 1:
3699 coding->type = coding_type_sjis;
3700 coding->common_flags
3701 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3702 break;
3703
3704 case 2:
3705 coding->type = coding_type_iso2022;
3706 coding->common_flags
3707 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3708 {
3709 Lisp_Object val, temp;
3710 Lisp_Object *flags;
3711 int i, charset, reg_bits = 0;
3712
3713 val = XVECTOR (coding_spec)->contents[4];
3714
3715 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3716 goto label_invalid_coding_system;
3717
3718 flags = XVECTOR (val)->contents;
3719 coding->flags
3720 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3721 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3722 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3723 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3724 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3725 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3726 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3727 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3728 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3729 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3730 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3731 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3732 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3733 );
3734
3735 /* Invoke graphic register 0 to plane 0. */
3736 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3737 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3738 CODING_SPEC_ISO_INVOCATION (coding, 1)
3739 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3740 /* Not single shifting at first. */
3741 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3742 /* Beginning of buffer should also be regarded as bol. */
3743 CODING_SPEC_ISO_BOL (coding) = 1;
3744
3745 for (charset = 0; charset <= MAX_CHARSET; charset++)
3746 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3747 val = Vcharset_revision_alist;
3748 while (CONSP (val))
3749 {
3750 charset = get_charset_id (Fcar_safe (XCAR (val)));
3751 if (charset >= 0
3752 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3753 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3754 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3755 val = XCDR (val);
3756 }
3757
3758 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3759 FLAGS[REG] can be one of below:
3760 integer CHARSET: CHARSET occupies register I,
3761 t: designate nothing to REG initially, but can be used
3762 by any charsets,
3763 list of integer, nil, or t: designate the first
3764 element (if integer) to REG initially, the remaining
3765 elements (if integer) is designated to REG on request,
3766 if an element is t, REG can be used by any charsets,
3767 nil: REG is never used. */
3768 for (charset = 0; charset <= MAX_CHARSET; charset++)
3769 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3770 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3771 for (i = 0; i < 4; i++)
3772 {
3773 if ((INTEGERP (flags[i])
3774 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3775 || (charset = get_charset_id (flags[i])) >= 0)
3776 {
3777 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3778 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3779 }
3780 else if (EQ (flags[i], Qt))
3781 {
3782 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3783 reg_bits |= 1 << i;
3784 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3785 }
3786 else if (CONSP (flags[i]))
3787 {
3788 Lisp_Object tail;
3789 tail = flags[i];
3790
3791 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3792 if ((INTEGERP (XCAR (tail))
3793 && (charset = XINT (XCAR (tail)),
3794 CHARSET_VALID_P (charset)))
3795 || (charset = get_charset_id (XCAR (tail))) >= 0)
3796 {
3797 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3798 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3799 }
3800 else
3801 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3802 tail = XCDR (tail);
3803 while (CONSP (tail))
3804 {
3805 if ((INTEGERP (XCAR (tail))
3806 && (charset = XINT (XCAR (tail)),
3807 CHARSET_VALID_P (charset)))
3808 || (charset = get_charset_id (XCAR (tail))) >= 0)
3809 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3810 = i;
3811 else if (EQ (XCAR (tail), Qt))
3812 reg_bits |= 1 << i;
3813 tail = XCDR (tail);
3814 }
3815 }
3816 else
3817 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3818
3819 CODING_SPEC_ISO_DESIGNATION (coding, i)
3820 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3821 }
3822
3823 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3824 {
3825 /* REG 1 can be used only by locking shift in 7-bit env. */
3826 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3827 reg_bits &= ~2;
3828 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3829 /* Without any shifting, only REG 0 and 1 can be used. */
3830 reg_bits &= 3;
3831 }
3832
3833 if (reg_bits)
3834 for (charset = 0; charset <= MAX_CHARSET; charset++)
3835 {
3836 if (CHARSET_DEFINED_P (charset)
3837 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3838 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3839 {
3840 /* There exist some default graphic registers to be
3841 used by CHARSET. */
3842
3843 /* We had better avoid designating a charset of
3844 CHARS96 to REG 0 as far as possible. */
3845 if (CHARSET_CHARS (charset) == 96)
3846 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3847 = (reg_bits & 2
3848 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3849 else
3850 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3851 = (reg_bits & 1
3852 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3853 }
3854 }
3855 }
3856 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3857 coding->spec.iso2022.last_invalid_designation_register = -1;
3858 break;
3859
3860 case 3:
3861 coding->type = coding_type_big5;
3862 coding->common_flags
3863 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3864 coding->flags
3865 = (NILP (XVECTOR (coding_spec)->contents[4])
3866 ? CODING_FLAG_BIG5_HKU
3867 : CODING_FLAG_BIG5_ETEN);
3868 break;
3869
3870 case 4:
3871 coding->type = coding_type_ccl;
3872 coding->common_flags
3873 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3874 {
3875 val = XVECTOR (coding_spec)->contents[4];
3876 if (! CONSP (val)
3877 || setup_ccl_program (&(coding->spec.ccl.decoder),
3878 XCAR (val)) < 0
3879 || setup_ccl_program (&(coding->spec.ccl.encoder),
3880 XCDR (val)) < 0)
3881 goto label_invalid_coding_system;
3882
3883 bzero (coding->spec.ccl.valid_codes, 256);
3884 val = Fplist_get (plist, Qvalid_codes);
3885 if (CONSP (val))
3886 {
3887 Lisp_Object this;
3888
3889 for (; CONSP (val); val = XCDR (val))
3890 {
3891 this = XCAR (val);
3892 if (INTEGERP (this)
3893 && XINT (this) >= 0 && XINT (this) < 256)
3894 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3895 else if (CONSP (this)
3896 && INTEGERP (XCAR (this))
3897 && INTEGERP (XCDR (this)))
3898 {
3899 int start = XINT (XCAR (this));
3900 int end = XINT (XCDR (this));
3901
3902 if (start >= 0 && start <= end && end < 256)
3903 while (start <= end)
3904 coding->spec.ccl.valid_codes[start++] = 1;
3905 }
3906 }
3907 }
3908 }
3909 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3910 coding->spec.ccl.cr_carryover = 0;
3911 coding->spec.ccl.eight_bit_carryover[0] = 0;
3912 break;
3913
3914 case 5:
3915 coding->type = coding_type_raw_text;
3916 break;
3917
3918 default:
3919 goto label_invalid_coding_system;
3920 }
3921 return 0;
3922
3923 label_invalid_coding_system:
3924 coding->type = coding_type_no_conversion;
3925 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3926 coding->common_flags = 0;
3927 coding->eol_type = CODING_EOL_UNDECIDED;
3928 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3929 return NILP (coding_system) ? 0 : -1;
3930 }
3931
3932 /* Free memory blocks allocated for storing composition information. */
3933
3934 void
3935 coding_free_composition_data (coding)
3936 struct coding_system *coding;
3937 {
3938 struct composition_data *cmp_data = coding->cmp_data, *next;
3939
3940 if (!cmp_data)
3941 return;
3942 /* Memory blocks are chained. At first, rewind to the first, then,
3943 free blocks one by one. */
3944 while (cmp_data->prev)
3945 cmp_data = cmp_data->prev;
3946 while (cmp_data)
3947 {
3948 next = cmp_data->next;
3949 xfree (cmp_data);
3950 cmp_data = next;
3951 }
3952 coding->cmp_data = NULL;
3953 }
3954
3955 /* Set `char_offset' member of all memory blocks pointed by
3956 coding->cmp_data to POS. */
3957
3958 void
3959 coding_adjust_composition_offset (coding, pos)
3960 struct coding_system *coding;
3961 int pos;
3962 {
3963 struct composition_data *cmp_data;
3964
3965 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3966 cmp_data->char_offset = pos;
3967 }
3968
3969 /* Setup raw-text or one of its subsidiaries in the structure
3970 coding_system CODING according to the already setup value eol_type
3971 in CODING. CODING should be setup for some coding system in
3972 advance. */
3973
3974 void
3975 setup_raw_text_coding_system (coding)
3976 struct coding_system *coding;
3977 {
3978 if (coding->type != coding_type_raw_text)
3979 {
3980 coding->symbol = Qraw_text;
3981 coding->type = coding_type_raw_text;
3982 if (coding->eol_type != CODING_EOL_UNDECIDED)
3983 {
3984 Lisp_Object subsidiaries;
3985 subsidiaries = Fget (Qraw_text, Qeol_type);
3986
3987 if (VECTORP (subsidiaries)
3988 && XVECTOR (subsidiaries)->size == 3)
3989 coding->symbol
3990 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3991 }
3992 setup_coding_system (coding->symbol, coding);
3993 }
3994 return;
3995 }
3996
3997 /* Emacs has a mechanism to automatically detect a coding system if it
3998 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3999 it's impossible to distinguish some coding systems accurately
4000 because they use the same range of codes. So, at first, coding
4001 systems are grouped into the following categories:
4002
4003 o coding-category-emacs-mule
4004
4005 The category for a coding system which has the same code range
4006 as Emacs' internal format. Assigned the coding-system (Lisp
4007 symbol) `emacs-mule' by default.
4008
4009 o coding-category-sjis
4010
4011 The category for a coding system which has the same code range
4012 as SJIS. Assigned the coding-system (Lisp
4013 symbol) `japanese-shift-jis' by default.
4014
4015 o coding-category-iso-7
4016
4017 The category for a coding system which has the same code range
4018 as ISO2022 of 7-bit environment. This doesn't use any locking
4019 shift and single shift functions. This can encode/decode all
4020 charsets. Assigned the coding-system (Lisp symbol)
4021 `iso-2022-7bit' by default.
4022
4023 o coding-category-iso-7-tight
4024
4025 Same as coding-category-iso-7 except that this can
4026 encode/decode only the specified charsets.
4027
4028 o coding-category-iso-8-1
4029
4030 The category for a coding system which has the same code range
4031 as ISO2022 of 8-bit environment and graphic plane 1 used only
4032 for DIMENSION1 charset. This doesn't use any locking shift
4033 and single shift functions. Assigned the coding-system (Lisp
4034 symbol) `iso-latin-1' by default.
4035
4036 o coding-category-iso-8-2
4037
4038 The category for a coding system which has the same code range
4039 as ISO2022 of 8-bit environment and graphic plane 1 used only
4040 for DIMENSION2 charset. This doesn't use any locking shift
4041 and single shift functions. Assigned the coding-system (Lisp
4042 symbol) `japanese-iso-8bit' by default.
4043
4044 o coding-category-iso-7-else
4045
4046 The category for a coding system which has the same code range
4047 as ISO2022 of 7-bit environment but uses locking shift or
4048 single shift functions. Assigned the coding-system (Lisp
4049 symbol) `iso-2022-7bit-lock' by default.
4050
4051 o coding-category-iso-8-else
4052
4053 The category for a coding system which has the same code range
4054 as ISO2022 of 8-bit environment but uses locking shift or
4055 single shift functions. Assigned the coding-system (Lisp
4056 symbol) `iso-2022-8bit-ss2' by default.
4057
4058 o coding-category-big5
4059
4060 The category for a coding system which has the same code range
4061 as BIG5. Assigned the coding-system (Lisp symbol)
4062 `cn-big5' by default.
4063
4064 o coding-category-utf-8
4065
4066 The category for a coding system which has the same code range
4067 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4068 symbol) `utf-8' by default.
4069
4070 o coding-category-utf-16-be
4071
4072 The category for a coding system in which a text has an
4073 Unicode signature (cf. Unicode Standard) in the order of BIG
4074 endian at the head. Assigned the coding-system (Lisp symbol)
4075 `utf-16-be' by default.
4076
4077 o coding-category-utf-16-le
4078
4079 The category for a coding system in which a text has an
4080 Unicode signature (cf. Unicode Standard) in the order of
4081 LITTLE endian at the head. Assigned the coding-system (Lisp
4082 symbol) `utf-16-le' by default.
4083
4084 o coding-category-ccl
4085
4086 The category for a coding system of which encoder/decoder is
4087 written in CCL programs. The default value is nil, i.e., no
4088 coding system is assigned.
4089
4090 o coding-category-binary
4091
4092 The category for a coding system not categorized in any of the
4093 above. Assigned the coding-system (Lisp symbol)
4094 `no-conversion' by default.
4095
4096 Each of them is a Lisp symbol and the value is an actual
4097 `coding-system' (this is also a Lisp symbol) assigned by a user.
4098 What Emacs does actually is to detect a category of coding system.
4099 Then, it uses a `coding-system' assigned to it. If Emacs can't
4100 decide a single possible category, it selects a category of the
4101 highest priority. Priorities of categories are also specified by a
4102 user in a Lisp variable `coding-category-list'.
4103
4104 */
4105
/* Table indexed by a byte value. detect_coding_mask skips over the
leading bytes of a text whose entry here is nonzero. That function
clears the entries for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC on
entry and may set them again while scanning. The code that fills
in the other entries is not in this part of the file. */
4106 static
4107 int ascii_skip_code[256];
4108
4109 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4110 If it detects possible coding systems, return an integer in which
4111 appropriate flag bits are set. Flag bits are defined by macros
4112 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4113 it should point the table `coding_priorities'. In that case, only
4114 the flag bit for a coding system of the highest priority is set in
4115 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4116 range 0x80..0x9F are in multibyte form.
4117
4118 How many ASCII characters are at the head is returned as *SKIP. */
4119
4120 static int
4121 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4122 unsigned char *source;
4123 int src_bytes, *priorities, *skip;
4124 int multibytep;
4125 {
4126 register unsigned char c;
4127 unsigned char *src = source, *src_end = source + src_bytes;
4128 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4129 int i;
4130
4131 /* At first, skip all ASCII characters and control characters except
4132 for three ISO2022 specific control characters. */
4133 ascii_skip_code[ISO_CODE_SO] = 0;
4134 ascii_skip_code[ISO_CODE_SI] = 0;
4135 ascii_skip_code[ISO_CODE_ESC] = 0;
4136
/* This label is re-entered by `goto' after an ESC, SO or SI that
detect_coding_iso2022 rejected (returned 0). By then the entry of
the rejected code in ascii_skip_code has been set to 1, so the scan
below also passes over later occurrences of it. */
4137 label_loop_detect_coding:
4138 while (src < src_end && ascii_skip_code[*src]) src++;
4139 *skip = src - source;
4140
4141 if (src >= src_end)
4142 /* We found nothing other than ASCII. There's nothing to do. */
4143 return 0;
4144
4145 c = *src;
4146 /* The text seems to be encoded in some multilingual coding system.
4147 Now, try to find in which coding system the text is encoded. */
4148 if (c < 0x80)
4149 {
4150 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4151 /* C is an ISO2022 specific control code of C0. */
4152 mask = detect_coding_iso2022 (src, src_end, multibytep);
4153 if (mask == 0)
4154 {
4155 /* No valid ISO2022 code follows C. Try again. */
4156 src++;
4157 if (c == ISO_CODE_ESC)
4158 ascii_skip_code[ISO_CODE_ESC] = 1;
4159 else
4160 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4161 goto label_loop_detect_coding;
4162 }
4163 if (priorities)
4164 {
/* Return the first category in priority order that the ISO2022
detector accepted. */
4165 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4166 {
4167 if (mask & priorities[i])
4168 return priorities[i];
4169 }
/* None of the prioritized categories is in MASK. */
4170 return CODING_CATEGORY_MASK_RAW_TEXT;
4171 }
/* Without PRIORITIES, control falls through to the final return
with MASK as computed by detect_coding_iso2022. */
4172 }
4173 else
4174 {
/* TRY collects the categories worth testing for the first non-ASCII
byte C. */
4175 int try;
4176
/* Recover the 8-bit code from the byte after the leading code by
subtracting 0x20. NOTE(review): src[1] is read without comparing
against SRC_END -- presumably safe because multibyte text is
well-formed; confirm. */
4177 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4178 c = src[1] - 0x20;
4179
4180 if (c < 0xA0)
4181 {
4182 /* C is the first byte of SJIS character code,
4183 or a leading-code of Emacs' internal format (emacs-mule),
4184 or the first byte of UTF-16. */
4185 try = (CODING_CATEGORY_MASK_SJIS
4186 | CODING_CATEGORY_MASK_EMACS_MULE
4187 | CODING_CATEGORY_MASK_UTF_16_BE
4188 | CODING_CATEGORY_MASK_UTF_16_LE);
4189
4190 /* Or, if C is a special latin extra code,
4191 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4192 or is an ISO2022 control-sequence-introducer (CSI),
4193 we should also consider the possibility of ISO2022 codings. */
/* NOTE(review): SRC has not been advanced past the byte C was read
from, and that byte is >= 0x80 in this branch, so the comparisons
of *src with ']', '0', '1' and '2' below cannot succeed as written.
Confirm whether the bytes after the CSI were meant to be tested. */
4194 if ((VECTORP (Vlatin_extra_code_table)
4195 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4196 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4197 || (c == ISO_CODE_CSI
4198 && (src < src_end
4199 && (*src == ']'
4200 || ((*src == '0' || *src == '1' || *src == '2')
4201 && src + 1 < src_end
4202 && src[1] == ']')))))
4203 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4204 | CODING_CATEGORY_MASK_ISO_8BIT);
4205 }
4206 else
4207 /* C is a character of ISO2022 in graphic plane right,
4208 or a SJIS's 1-byte character code (i.e. JISX0201),
4209 or the first byte of BIG5's 2-byte code,
4210 or the first byte of UTF-8/16. */
4211 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4212 | CODING_CATEGORY_MASK_ISO_8BIT
4213 | CODING_CATEGORY_MASK_SJIS
4214 | CODING_CATEGORY_MASK_BIG5
4215 | CODING_CATEGORY_MASK_UTF_8
4216 | CODING_CATEGORY_MASK_UTF_16_BE
4217 | CODING_CATEGORY_MASK_UTF_16_LE);
4218
4219 /* Or, we may have to consider the possibility of CCL. */
4220 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4221 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4222 ->spec.ccl.valid_codes)[c])
4223 try |= CODING_CATEGORY_MASK_CCL;
4224
4225 mask = 0;
4226 utf16_examined_p = iso2022_examined_p = 0;
4227 if (priorities)
4228 {
/* Walk the categories in priority order. Each entry runs at most
one detector (the branches form an else-if chain), and the ISO2022
and UTF-16 detectors run at most once each because of the
*_examined_p flags. The first entry whose bit is present in the
accumulated MASK is returned. */
4229 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4230 {
4231 if (!iso2022_examined_p
4232 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4233 {
4234 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4235 iso2022_examined_p = 1;
4236 }
4237 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4238 mask |= detect_coding_sjis (src, src_end, multibytep);
4239 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4240 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4241 else if (!utf16_examined_p
4242 && (priorities[i] & try &
4243 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4244 {
4245 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4246 utf16_examined_p = 1;
4247 }
4248 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4249 mask |= detect_coding_big5 (src, src_end, multibytep);
4250 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4251 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4252 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4253 mask |= detect_coding_ccl (src, src_end, multibytep);
/* Raw-text and binary need no detector; reaching their priority
entry is enough to accept them. */
4254 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4255 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4256 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4257 mask |= CODING_CATEGORY_MASK_BINARY;
4258 if (mask & priorities[i])
4259 return priorities[i];
4260 }
4261 return CODING_CATEGORY_MASK_RAW_TEXT;
4262 }
/* No priority table: run every detector selected by TRY and collect
all the categories they report. */
4263 if (try & CODING_CATEGORY_MASK_ISO)
4264 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4265 if (try & CODING_CATEGORY_MASK_SJIS)
4266 mask |= detect_coding_sjis (src, src_end, multibytep);
4267 if (try & CODING_CATEGORY_MASK_BIG5)
4268 mask |= detect_coding_big5 (src, src_end, multibytep);
4269 if (try & CODING_CATEGORY_MASK_UTF_8)
4270 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4271 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4272 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4273 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4274 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4275 if (try & CODING_CATEGORY_MASK_CCL)
4276 mask |= detect_coding_ccl (src, src_end, multibytep);
4277 }
/* Reached only when PRIORITIES is null. The raw-text and binary
bits are always added to whatever the detectors reported. */
4278 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4279 }
4280
4281 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4282 The information of the detected coding system is set in CODING. */
4283
4284 void
4285 detect_coding (coding, src, src_bytes)
4286 struct coding_system *coding;
4287 const unsigned char *src;
4288 int src_bytes;
4289 {
4290 unsigned int idx;
4291 int skip, mask;
4292 Lisp_Object val;
4293
4294 val = Vcoding_category_list;
4295 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4296 coding->src_multibyte);
4297 coding->heading_ascii = skip;
4298
4299 if (!mask) return;
4300
4301 /* We found a single coding system of the highest priority in MASK. */
4302 idx = 0;
4303 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4304 if (! mask)
4305 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4306
4307 val = find_symbol_value (XVECTOR (Vcoding_category_table)->contents[idx]);
4308
4309 if (coding->eol_type != CODING_EOL_UNDECIDED)
4310 {
4311 Lisp_Object tmp;
4312
4313 tmp = Fget (val, Qeol_type);
4314 if (VECTORP (tmp))
4315 val = XVECTOR (tmp)->contents[coding->eol_type];
4316 }
4317
4318 /* Setup this new coding system while preserving some slots. */
4319 {
4320 int src_multibyte = coding->src_multibyte;
4321 int dst_multibyte = coding->dst_multibyte;
4322
4323 setup_coding_system (val, coding);
4324 coding->src_multibyte = src_multibyte;
4325 coding->dst_multibyte = dst_multibyte;
4326 coding->heading_ascii = skip;
4327 }
4328 }
4329
4330 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4331 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4332 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4333
4334 How many non-eol characters are at the head is returned as *SKIP. */
4335
4336 #define MAX_EOL_CHECK_COUNT 3
4337
4338 static int
4339 detect_eol_type (source, src_bytes, skip)
4340 const unsigned char *source;
4341 int src_bytes, *skip;
4342 {
4343 const unsigned char *src = source, *src_end = src + src_bytes;
4344 unsigned char c;
4345 int total = 0; /* How many end-of-lines are found so far. */
4346 int eol_type = CODING_EOL_UNDECIDED;
4347 int this_eol_type;
4348
4349 *skip = 0;
4350
4351 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4352 {
4353 c = *src++;
4354 if (c == '\n' || c == '\r')
4355 {
4356 if (*skip == 0)
4357 *skip = src - 1 - source;
4358 total++;
4359 if (c == '\n')
4360 this_eol_type = CODING_EOL_LF;
4361 else if (src >= src_end || *src != '\n')
4362 this_eol_type = CODING_EOL_CR;
4363 else
4364 this_eol_type = CODING_EOL_CRLF, src++;
4365
4366 if (eol_type == CODING_EOL_UNDECIDED)
4367 /* This is the first end-of-line. */
4368 eol_type = this_eol_type;
4369 else if (eol_type != this_eol_type)
4370 {
4371 /* The found type is different from what found before. */
4372 eol_type = CODING_EOL_INCONSISTENT;
4373 break;
4374 }
4375 }
4376 }
4377
4378 if (*skip == 0)
4379 *skip = src_end - source;
4380 return eol_type;
4381 }
4382
4383 /* Like detect_eol_type, but detect EOL type in 2-octet
4384 big-endian/little-endian format for coding systems utf-16-be and
4385 utf-16-le. */
4386
4387 static int
4388 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4389 const unsigned char *source;
4390 int src_bytes, *skip, big_endian_p;
4391 {
4392 const unsigned char *src = source, *src_end = src + src_bytes;
4393 unsigned int c1, c2;
4394 int total = 0; /* How many end-of-lines are found so far. */
4395 int eol_type = CODING_EOL_UNDECIDED;
4396 int this_eol_type;
4397 int msb, lsb;
4398
4399 if (big_endian_p)
4400 msb = 0, lsb = 1;
4401 else
4402 msb = 1, lsb = 0;
4403
4404 *skip = 0;
4405
4406 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4407 {
4408 c1 = (src[msb] << 8) | (src[lsb]);
4409 src += 2;
4410
4411 if (c1 == '\n' || c1 == '\r')
4412 {
4413 if (*skip == 0)
4414 *skip = src - 2 - source;
4415 total++;
4416 if (c1 == '\n')
4417 {
4418 this_eol_type = CODING_EOL_LF;
4419 }
4420 else
4421 {
4422 if ((src + 1) >= src_end)
4423 {
4424 this_eol_type = CODING_EOL_CR;
4425 }
4426 else
4427 {
4428 c2 = (src[msb] << 8) | (src[lsb]);
4429 if (c2 == '\n')
4430 this_eol_type = CODING_EOL_CRLF, src += 2;
4431 else
4432 this_eol_type = CODING_EOL_CR;
4433 }
4434 }
4435
4436 if (eol_type == CODING_EOL_UNDECIDED)
4437 /* This is the first end-of-line. */
4438 eol_type = this_eol_type;
4439 else if (eol_type != this_eol_type)
4440 {
4441 /* The found type is different from what found before. */
4442 eol_type = CODING_EOL_INCONSISTENT;
4443 break;
4444 }
4445 }
4446 }
4447
4448 if (*skip == 0)
4449 *skip = src_end - source;
4450 return eol_type;
4451 }
4452
4453 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4454 is encoded. If it detects an appropriate format of end-of-line, it
4455 sets the information in *CODING. */
4456
4457 void
4458 detect_eol (coding, src, src_bytes)
4459 struct coding_system *coding;
4460 const unsigned char *src;
4461 int src_bytes;
4462 {
4463 Lisp_Object val;
4464 int skip;
4465 int eol_type;
4466
4467 switch (coding->category_idx)
4468 {
4469 case CODING_CATEGORY_IDX_UTF_16_BE:
4470 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4471 break;
4472 case CODING_CATEGORY_IDX_UTF_16_LE:
4473 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4474 break;
4475 default:
4476 eol_type = detect_eol_type (src, src_bytes, &skip);
4477 break;
4478 }
4479
4480 if (coding->heading_ascii > skip)
4481 coding->heading_ascii = skip;
4482 else
4483 skip = coding->heading_ascii;
4484
4485 if (eol_type == CODING_EOL_UNDECIDED)
4486 return;
4487 if (eol_type == CODING_EOL_INCONSISTENT)
4488 {
4489 #if 0
4490 /* This code is suppressed until we find a better way to
4491 distinguish raw text file and binary file. */
4492
4493 /* If we have already detected that the coding is raw-text, the
4494 coding should actually be no-conversion. */
4495 if (coding->type == coding_type_raw_text)
4496 {
4497 setup_coding_system (Qno_conversion, coding);
4498 return;
4499 }
4500 /* Else, let's decode only text code anyway. */
4501 #endif /* 0 */
4502 eol_type = CODING_EOL_LF;
4503 }
4504
4505 val = Fget (coding->symbol, Qeol_type);
4506 if (VECTORP (val) && XVECTOR (val)->size == 3)
4507 {
4508 int src_multibyte = coding->src_multibyte;
4509 int dst_multibyte = coding->dst_multibyte;
4510 struct composition_data *cmp_data = coding->cmp_data;
4511
4512 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4513 coding->src_multibyte = src_multibyte;
4514 coding->dst_multibyte = dst_multibyte;
4515 coding->heading_ascii = skip;
4516 coding->cmp_data = cmp_data;
4517 }
4518 }
4519
4520 #define CONVERSION_BUFFER_EXTRA_ROOM 256
4521
4522 #define DECODING_BUFFER_MAG(coding) \
4523 (coding->type == coding_type_iso2022 \
4524 ? 3 \
4525 : (coding->type == coding_type_ccl \
4526 ? coding->spec.ccl.decoder.buf_magnification \
4527 : 2))
4528
4529 /* Return maximum size (bytes) of a buffer enough for decoding
4530 SRC_BYTES of text encoded in CODING. */
4531
4532 int
4533 decoding_buffer_size (coding, src_bytes)
4534 struct coding_system *coding;
4535 int src_bytes;
4536 {
4537 return (src_bytes * DECODING_BUFFER_MAG (coding)
4538 + CONVERSION_BUFFER_EXTRA_ROOM);
4539 }
4540
4541 /* Return maximum size (bytes) of a buffer enough for encoding
4542 SRC_BYTES of text to CODING. */
4543
4544 int
4545 encoding_buffer_size (coding, src_bytes)
4546 struct coding_system *coding;
4547 int src_bytes;
4548 {
4549 int magnification;
4550
4551 if (coding->type == coding_type_ccl)
4552 {
4553 magnification = coding->spec.ccl.encoder.buf_magnification;
4554 if (coding->eol_type == CODING_EOL_CRLF)
4555 magnification *= 2;
4556 }
4557 else if (CODING_REQUIRE_ENCODING (coding))
4558 magnification = 3;
4559 else
4560 magnification = 1;
4561
4562 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4563 }
4564
4565 /* Working buffer for code conversion. It is filled in by
allocate_conversion_buffer, grown by extend_conversion_buffer and
released by free_conversion_buffer. */
4566 struct conversion_buffer
4567 {
4568 int size; /* Number of bytes allocated at DATA. */
4569 int on_stack; /* 1 if allocated by alloca, 0 if by xmalloc/xrealloc. */
4570 unsigned char *data; /* The storage itself. */
4571 };
4572
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served by alloca, so this has
   to be a macro expanded in the function that uses the buffer; larger
   ones come from xmalloc and must be released with
   free_conversion_buffer.  BUF and LEN are parenthesized in the
   expansion but are evaluated more than once, so they must not have
   side effects.  */
#define allocate_conversion_buffer(buf, len)		\
  do {							\
    if ((len) < MAX_ALLOCA)				\
      {							\
	(buf).data = (unsigned char *) alloca (len);	\
	(buf).on_stack = 1;				\
      }							\
    else						\
      {							\
	(buf).data = (unsigned char *) xmalloc (len);	\
	(buf).on_stack = 0;				\
      }							\
    (buf).size = (len);					\
  } while (0)
4588
4589 /* Double the allocated memory for *BUF. */
4590 static void
4591 extend_conversion_buffer (buf)
4592 struct conversion_buffer *buf;
4593 {
4594 if (buf->on_stack)
4595 {
4596 unsigned char *save = buf->data;
4597 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4598 bcopy (save, buf->data, buf->size);
4599 buf->on_stack = 0;
4600 }
4601 else
4602 {
4603 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4604 }
4605 buf->size *= 2;
4606 }
4607
4608 /* Free the allocated memory for BUF if it is not on stack. */
4609 static void
4610 free_conversion_buffer (buf)
4611 struct conversion_buffer *buf;
4612 {
4613 if (!buf->on_stack)
4614 xfree (buf->data);
4615 }
4616
/* Run the CCL program of CODING on SRC_BYTES bytes at SOURCE and
write the result to DESTINATION: the encoder if ENCODEP is nonzero,
otherwise the decoder. DST_BYTES is passed on to ccl_driver as the
destination size. Sets CODING->produced, CODING->produced_char,
CODING->consumed and CODING->result, and returns CODING->result. */
4617 int
4618 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4619 struct coding_system *coding;
4620 unsigned char *source, *destination;
4621 int src_bytes, dst_bytes, encodep;
4622 {
4623 struct ccl_program *ccl
4624 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4625 unsigned char *dst = destination;
4626
4627 ccl->suppress_error = coding->suppress_error;
4628 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4629 if (encodep)
4630 {
4631 /* On encoding, EOL format is converted within ccl_driver. For
4632 that, setup proper information in the structure CCL. */
4633 ccl->eol_type = coding->eol_type;
4634 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4635 ccl->eol_type = CODING_EOL_LF;
4636 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4637 ccl->eight_bit_control = coding->dst_multibyte;
4638 }
4639 else
4640 ccl->eight_bit_control = 1;
4641 ccl->multibyte = coding->src_multibyte;
4642 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4643 {
4644 /* Move carryover bytes to DESTINATION. */
/* The carryover is a zero-terminated byte string saved by an
earlier call (see the BCOPY_SHORT below). */
4645 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4646 while (*p)
4647 *dst++ = *p++;
4648 coding->spec.ccl.eight_bit_carryover[0] = 0;
/* A DST_BYTES of zero is left as is; otherwise reduce it by the
bytes just written. */
4649 if (dst_bytes)
4650 dst_bytes -= dst - destination;
4651 }
4652
/* PRODUCED counts the carryover bytes placed before DST as well as
what ccl_driver wrote. */
4653 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4654 &(coding->consumed))
4655 + dst - destination);
4656
4657 if (encodep)
4658 {
4659 coding->produced_char = coding->produced;
4660 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4661 }
4662 else if (!ccl->eight_bit_control)
4663 {
4664 /* The produced bytes forms a valid multibyte sequence. */
4665 coding->produced_char
4666 = multibyte_chars_in_text (destination, coding->produced);
4667 coding->spec.ccl.eight_bit_carryover[0] = 0;
4668 }
4669 else
4670 {
4671 /* On decoding, the destination should always multibyte. But,
4672 CCL program might have been generated an invalid multibyte
4673 sequence. Here we make such a sequence valid as
4674 multibyte. */
/* NOTE(review): when DST_BYTES is zero, BYTES is computed from the
distance between SOURCE + consumed and DESTINATION, which presumably
means the caller decodes in place with DESTINATION preceding SOURCE
in one buffer -- confirm against the callers. */
4675 int bytes
4676 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4677
4678 if ((coding->consumed < src_bytes
4679 || !ccl->last_block)
4680 && coding->produced >= 1
4681 && destination[coding->produced - 1] >= 0x80)
4682 {
4683 /* We should not convert the tailing 8-bit codes to
4684 multibyte form even if they doesn't form a valid
4685 multibyte sequence. They may form a valid sequence in
4686 the next call. */
4687 int carryover = 0;
4688
/* Look back at most three bytes for a byte in 0x80..0x9F; CARRYOVER
becomes the number of trailing bytes starting at that byte. */
4689 if (destination[coding->produced - 1] < 0xA0)
4690 carryover = 1;
4691 else if (coding->produced >= 2)
4692 {
4693 if (destination[coding->produced - 2] >= 0x80)
4694 {
4695 if (destination[coding->produced - 2] < 0xA0)
4696 carryover = 2;
4697 else if (coding->produced >= 3
4698 && destination[coding->produced - 3] >= 0x80
4699 && destination[coding->produced - 3] < 0xA0)
4700 carryover = 3;
4701 }
4702 }
4703 if (carryover > 0)
4704 {
/* Save the trailing bytes, zero-terminated, for the next call and
drop them from this call's output. */
4705 BCOPY_SHORT (destination + coding->produced - carryover,
4706 coding->spec.ccl.eight_bit_carryover,
4707 carryover);
4708 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4709 coding->produced -= carryover;
4710 }
4711 }
4712 coding->produced = str_as_multibyte (destination, bytes,
4713 coding->produced,
4714 &(coding->produced_char));
4715 }
4716
/* Translate the status left by ccl_driver into a CODING_FINISH_*
result code. */
4717 switch (ccl->status)
4718 {
4719 case CCL_STAT_SUSPEND_BY_SRC:
4720 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4721 break;
4722 case CCL_STAT_SUSPEND_BY_DST:
4723 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4724 break;
4725 case CCL_STAT_QUIT:
4726 case CCL_STAT_INVALID_CMD:
4727 coding->result = CODING_FINISH_INTERRUPT;
4728 break;
4729 default:
4730 coding->result = CODING_FINISH_NORMAL;
4731 break;
4732 }
4733 return coding->result;
4734 }
4735
4736 /* Decode EOL format of the text at PTR of BYTES length destructively
4737 according to CODING->eol_type. This is called after the CCL
4738 program produced a decoded text at PTR. If we do CRLF->LF
4739 conversion, update CODING->produced and CODING->produced_char. */
4740
4741 static void
4742 decode_eol_post_ccl (coding, ptr, bytes)
4743 struct coding_system *coding;
4744 unsigned char *ptr;
4745 int bytes;
4746 {
4747 Lisp_Object val, saved_coding_symbol;
4748 unsigned char *pend = ptr + bytes;
4749 int dummy;
4750
4751 /* Remember the current coding system symbol. We set it back when
4752 an inconsistent EOL is found so that `last-coding-system-used' is
4753 set to the coding system that doesn't specify EOL conversion. */
4754 saved_coding_symbol = coding->symbol;
4755
4756 coding->spec.ccl.cr_carryover = 0;
4757 if (coding->eol_type == CODING_EOL_UNDECIDED)
4758 {
4759 /* Here, to avoid the call of setup_coding_system, we directly
4760 call detect_eol_type. */
4761 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4762 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4763 coding->eol_type = CODING_EOL_LF;
4764 if (coding->eol_type != CODING_EOL_UNDECIDED)
4765 {
4766 val = Fget (coding->symbol, Qeol_type);
4767 if (VECTORP (val) && XVECTOR (val)->size == 3)
4768 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4769 }
4770 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4771 }
4772
4773 if (coding->eol_type == CODING_EOL_LF
4774 || coding->eol_type == CODING_EOL_UNDECIDED)
4775 {
4776 /* We have nothing to do. */
4777 ptr = pend;
4778 }
4779 else if (coding->eol_type == CODING_EOL_CRLF)
4780 {
4781 unsigned char *pstart = ptr, *p = ptr;
4782
4783 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4784 && *(pend - 1) == '\r')
4785 {
4786 /* If the last character is CR, we can't handle it here
4787 because LF will be in the not-yet-decoded source text.
4788 Record that the CR is not yet processed. */
4789 coding->spec.ccl.cr_carryover = 1;
4790 coding->produced--;
4791 coding->produced_char--;
4792 pend--;
4793 }
4794 while (ptr < pend)
4795 {
4796 if (*ptr == '\r')
4797 {
4798 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4799 {
4800 *p++ = '\n';
4801 ptr += 2;
4802 }
4803 else
4804 {
4805 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4806 goto undo_eol_conversion;
4807 *p++ = *ptr++;
4808 }
4809 }
4810 else if (*ptr == '\n'
4811 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4812 goto undo_eol_conversion;
4813 else
4814 *p++ = *ptr++;
4815 continue;
4816
4817 undo_eol_conversion:
4818 /* We have faced with inconsistent EOL format at PTR.
4819 Convert all LFs before PTR back to CRLFs. */
4820 for (p--, ptr--; p >= pstart; p--)
4821 {
4822 if (*p == '\n')
4823 *ptr-- = '\n', *ptr-- = '\r';
4824 else
4825 *ptr-- = *p;
4826 }
4827 /* If carryover is recorded, cancel it because we don't
4828 convert CRLF anymore. */
4829 if (coding->spec.ccl.cr_carryover)
4830 {
4831 coding->spec.ccl.cr_carryover = 0;
4832 coding->produced++;
4833 coding->produced_char++;
4834 pend++;
4835 }
4836 p = ptr = pend;
4837 coding->eol_type = CODING_EOL_LF;
4838 coding->symbol = saved_coding_symbol;
4839 }
4840 if (p < pend)
4841 {
4842 /* As each two-byte sequence CRLF was converted to LF, (PEND
4843 - P) is the number of deleted characters. */
4844 coding->produced -= pend - p;
4845 coding->produced_char -= pend - p;
4846 }
4847 }
4848 else /* i.e. coding->eol_type == CODING_EOL_CR */
4849 {
4850 unsigned char *p = ptr;
4851
4852 for (; ptr < pend; ptr++)
4853 {
4854 if (*ptr == '\r')
4855 *ptr = '\n';
4856 else if (*ptr == '\n'
4857 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4858 {
4859 for (; p < ptr; p++)
4860 {
4861 if (*p == '\n')
4862 *p = '\r';
4863 }
4864 ptr = pend;
4865 coding->eol_type = CODING_EOL_LF;
4866 coding->symbol = saved_coding_symbol;
4867 }
4868 }
4869 }
4870 }
4871
4872 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4873 decoding, it may detect coding system and format of end-of-line if
4874 those are not yet decided. The source should be unibyte, the
4875 result is multibyte if CODING->dst_multibyte is nonzero, else
4876 unibyte. */
/* Decodes SRC_BYTES bytes at SOURCE into DESTINATION by dispatching
on CODING->type, resets and updates the produced/consumed counters
of CODING, and returns CODING->result. */
4877
4878 int
4879 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4880 struct coding_system *coding;
4881 const unsigned char *source;
4882 unsigned char *destination;
4883 int src_bytes, dst_bytes;
4884 {
/* Number of bytes put at DESTINATION ahead of the CCL output; it
stays 0 unless a CR was carried over from the previous call. */
4885 int extra = 0;
4886
4887 if (coding->type == coding_type_undecided)
4888 detect_coding (coding, source, src_bytes);
4889
/* For CCL the EOL format is handled after decoding, by
decode_eol_post_ccl below. */
4890 if (coding->eol_type == CODING_EOL_UNDECIDED
4891 && coding->type != coding_type_ccl)
4892 {
4893 detect_eol (coding, source, src_bytes);
4894 /* We had better recover the original eol format if we
4895 encounter an inconsistent eol format while decoding. */
4896 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4897 }
4898
4899 coding->produced = coding->produced_char = 0;
4900 coding->consumed = coding->consumed_char = 0;
4901 coding->errors = 0;
4902 coding->result = CODING_FINISH_NORMAL;
4903
4904 switch (coding->type)
4905 {
4906 case coding_type_sjis:
4907 decode_coding_sjis_big5 (coding, source, destination,
4908 src_bytes, dst_bytes, 1);
4909 break;
4910
4911 case coding_type_iso2022:
4912 decode_coding_iso2022 (coding, source, destination,
4913 src_bytes, dst_bytes);
4914 break;
4915
4916 case coding_type_big5:
4917 decode_coding_sjis_big5 (coding, source, destination,
4918 src_bytes, dst_bytes, 0);
4919 break;
4920
4921 case coding_type_emacs_mule:
4922 decode_coding_emacs_mule (coding, source, destination,
4923 src_bytes, dst_bytes);
4924 break;
4925
4926 case coding_type_ccl:
4927 if (coding->spec.ccl.cr_carryover)
4928 {
4929 /* Put the CR which was not processed by the previous call
4930 of decode_eol_post_ccl in DESTINATION. It will be
4931 decoded together with the following LF by the call to
4932 decode_eol_post_ccl below. */
4933 *destination = '\r';
4934 coding->produced++;
4935 coding->produced_char++;
4936 dst_bytes--;
4937 extra = coding->spec.ccl.cr_carryover;
4938 }
4939 ccl_coding_driver (coding, source, destination + extra,
4940 src_bytes, dst_bytes, 0);
4941 if (coding->eol_type != CODING_EOL_LF)
4942 {
/* ccl_coding_driver assigns the counters afresh, so the carried-over
CR is added back before the EOL pass. */
4943 coding->produced += extra;
4944 coding->produced_char += extra;
4945 decode_eol_post_ccl (coding, destination, coding->produced);
4946 }
4947 break;
4948
4949 default:
4950 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4951 }
4952
/* Running out of source is not an error on the last block when
every source byte was consumed. */
4953 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4954 && coding->mode & CODING_MODE_LAST_BLOCK
4955 && coding->consumed == src_bytes)
4956 coding->result = CODING_FINISH_NORMAL;
4957
/* Otherwise, on the last block, the source bytes left unconsumed are
copied out one by one through CHAR_STRING and counted as one
error. */
4958 if (coding->mode & CODING_MODE_LAST_BLOCK
4959 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4960 {
4961 const unsigned char *src = source + coding->consumed;
4962 unsigned char *dst = destination + coding->produced;
4963
4964 src_bytes -= coding->consumed;
4965 coding->errors++;
/* NOTE(review): DECODE_COMPOSITION_END is defined elsewhere in the
file and presumably refers to the locals named `coding' and `dst';
verify before renaming either. */
4966 if (COMPOSING_P (coding))
4967 DECODE_COMPOSITION_END ('1');
4968 while (src_bytes--)
4969 {
4970 int c = *src++;
4971 dst += CHAR_STRING (c, dst);
4972 coding->produced_char++;
4973 }
4974 coding->consumed = coding->consumed_char = src - source;
4975 coding->produced = dst - destination;
4976 coding->result = CODING_FINISH_NORMAL;
4977 }
4978
/* For a unibyte destination the decoded text is converted in place
and every byte counts as one character. */
4979 if (!coding->dst_multibyte)
4980 {
4981 coding->produced = str_as_unibyte (destination, coding->produced);
4982 coding->produced_char = coding->produced;
4983 }
4984
4985 return coding->result;
4986 }
4987
4988 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4989 multibyteness of the source is CODING->src_multibyte, the
4990 multibyteness of the result is always unibyte. */
4991
4992 int
4993 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4994 struct coding_system *coding;
4995 const unsigned char *source;
4996 unsigned char *destination;
4997 int src_bytes, dst_bytes;
4998 {
4999 coding->produced = coding->produced_char = 0;
5000 coding->consumed = coding->consumed_char = 0;
5001 coding->errors = 0;
5002 coding->result = CODING_FINISH_NORMAL;
5003 if (coding->eol_type == CODING_EOL_UNDECIDED)
5004 coding->eol_type = CODING_EOL_LF;
5005
5006 switch (coding->type)
5007 {
5008 case coding_type_sjis:
5009 encode_coding_sjis_big5 (coding, source, destination,
5010 src_bytes, dst_bytes, 1);
5011 break;
5012
5013 case coding_type_iso2022:
5014 encode_coding_iso2022 (coding, source, destination,
5015 src_bytes, dst_bytes);
5016 break;
5017
5018 case coding_type_big5:
5019 encode_coding_sjis_big5 (coding, source, destination,
5020 src_bytes, dst_bytes, 0);
5021 break;
5022
5023 case coding_type_emacs_mule:
5024 encode_coding_emacs_mule (coding, source, destination,
5025 src_bytes, dst_bytes);
5026 break;
5027
5028 case coding_type_ccl:
5029 ccl_coding_driver (coding, source, destination,
5030 src_bytes, dst_bytes, 1);
5031 break;
5032
5033 default:
5034 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5035 }
5036
5037 if (coding->mode & CODING_MODE_LAST_BLOCK
5038 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5039 {
5040 const unsigned char *src = source + coding->consumed;
5041 unsigned char *dst = destination + coding->produced;
5042
5043 if (coding->type == coding_type_iso2022)
5044 ENCODE_RESET_PLANE_AND_REGISTER;
5045 if (COMPOSING_P (coding))
5046 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5047 if (coding->consumed < src_bytes)
5048 {
5049 int len = src_bytes - coding->consumed;
5050
5051 BCOPY_SHORT (src, dst, len);
5052 if (coding->src_multibyte)
5053 len = str_as_unibyte (dst, len);
5054 dst += len;
5055 coding->consumed = src_bytes;
5056 }
5057 coding->produced = coding->produced_char = dst - destination;
5058 coding->result = CODING_FINISH_NORMAL;
5059 }
5060
5061 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5062 && coding->consumed == src_bytes)
5063 coding->result = CODING_FINISH_NORMAL;
5064
5065 return coding->result;
5066 }
5067
5068 /* Scan text in the region between *BEG and *END (byte positions),
5069 skip characters which we don't have to decode by coding system
5070 CODING at the head and tail, then set *BEG and *END to the region
5071 of the text we actually have to convert. The caller should move
5072 the gap out of the region in advance if the region is from a
5073 buffer.
5074
5075 If STR is not NULL, *BEG and *END are indices into STR. */
5076
5077 static void
5078 shrink_decoding_region (beg, end, coding, str)
5079 int *beg, *end;
5080 struct coding_system *coding;
5081 unsigned char *str;
5082 {
5083 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5084 int eol_conversion;
5085 Lisp_Object translation_table;
5086
5087 if (coding->type == coding_type_ccl
5088 || coding->type == coding_type_undecided
5089 || coding->eol_type != CODING_EOL_LF
5090 || !NILP (coding->post_read_conversion)
5091 || coding->composing != COMPOSITION_DISABLED)
5092 {
5093 /* We can't skip any data. */
5094 return;
5095 }
5096 if (coding->type == coding_type_no_conversion
5097 || coding->type == coding_type_raw_text
5098 || coding->type == coding_type_emacs_mule)
5099 {
5100 /* We need no conversion, but don't have to skip any data here.
5101 Decoding routine handles them effectively anyway. */
5102 return;
5103 }
5104
5105 translation_table = coding->translation_table_for_decode;
5106 if (NILP (translation_table) && !NILP (Venable_character_translation))
5107 translation_table = Vstandard_translation_table_for_decode;
5108 if (CHAR_TABLE_P (translation_table))
5109 {
5110 int i;
5111 for (i = 0; i < 128; i++)
5112 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5113 break;
5114 if (i < 128)
5115 /* Some ASCII character should be translated. We give up
5116 shrinking. */
5117 return;
5118 }
5119
5120 if (coding->heading_ascii >= 0)
5121 /* Detection routine has already found how much we can skip at the
5122 head. */
5123 *beg += coding->heading_ascii;
5124
5125 if (str)
5126 {
5127 begp_orig = begp = str + *beg;
5128 endp_orig = endp = str + *end;
5129 }
5130 else
5131 {
5132 begp_orig = begp = BYTE_POS_ADDR (*beg);
5133 endp_orig = endp = begp + *end - *beg;
5134 }
5135
5136 eol_conversion = (coding->eol_type == CODING_EOL_CR
5137 || coding->eol_type == CODING_EOL_CRLF);
5138
5139 switch (coding->type)
5140 {
5141 case coding_type_sjis:
5142 case coding_type_big5:
5143 /* We can skip all ASCII characters at the head. */
5144 if (coding->heading_ascii < 0)
5145 {
5146 if (eol_conversion)
5147 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5148 else
5149 while (begp < endp && *begp < 0x80) begp++;
5150 }
5151 /* We can skip all ASCII characters at the tail except for the
5152 second byte of SJIS or BIG5 code. */
5153 if (eol_conversion)
5154 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5155 else
5156 while (begp < endp && endp[-1] < 0x80) endp--;
5157 /* Do not consider LF as ascii if preceded by CR, since that
5158 confuses eol decoding. */
5159 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5160 endp++;
5161 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5162 endp++;
5163 break;
5164
5165 case coding_type_iso2022:
5166 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5167 /* We can't skip any data. */
5168 break;
5169 if (coding->heading_ascii < 0)
5170 {
5171 /* We can skip all ASCII characters at the head except for a
5172 few control codes. */
5173 while (begp < endp && (c = *begp) < 0x80
5174 && c != ISO_CODE_CR && c != ISO_CODE_SO
5175 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5176 && (!eol_conversion || c != ISO_CODE_LF))
5177 begp++;
5178 }
5179 switch (coding->category_idx)
5180 {
5181 case CODING_CATEGORY_IDX_ISO_8_1:
5182 case CODING_CATEGORY_IDX_ISO_8_2:
5183 /* We can skip all ASCII characters at the tail. */
5184 if (eol_conversion)
5185 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5186 else
5187 while (begp < endp && endp[-1] < 0x80) endp--;
5188 /* Do not consider LF as ascii if preceded by CR, since that
5189 confuses eol decoding. */
5190 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5191 endp++;
5192 break;
5193
5194 case CODING_CATEGORY_IDX_ISO_7:
5195 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5196 {
5197 /* We can skip all characters at the tail except for 8-bit
5198 codes and ESC and the following 2-byte at the tail. */
5199 unsigned char *eight_bit = NULL;
5200
5201 if (eol_conversion)
5202 while (begp < endp
5203 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5204 {
5205 if (!eight_bit && c & 0x80) eight_bit = endp;
5206 endp--;
5207 }
5208 else
5209 while (begp < endp
5210 && (c = endp[-1]) != ISO_CODE_ESC)
5211 {
5212 if (!eight_bit && c & 0x80) eight_bit = endp;
5213 endp--;
5214 }
5215 /* Do not consider LF as ascii if preceded by CR, since that
5216 confuses eol decoding. */
5217 if (begp < endp && endp < endp_orig
5218 && endp[-1] == '\r' && endp[0] == '\n')
5219 endp++;
5220 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5221 {
5222 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5223 /* This is an ASCII designation sequence. We can
5224 surely skip the tail. But, if we have
5225 encountered an 8-bit code, skip only the codes
5226 after that. */
5227 endp = eight_bit ? eight_bit : endp + 2;
5228 else
5229 /* Hmmm, we can't skip the tail. */
5230 endp = endp_orig;
5231 }
5232 else if (eight_bit)
5233 endp = eight_bit;
5234 }
5235 }
5236 break;
5237
5238 default:
5239 abort ();
5240 }
5241 *beg += begp - begp_orig;
5242 *end += endp - endp_orig;
5243 return;
5244 }
5245
5246 /* Like shrink_decoding_region but for encoding. */
5247
5248 static void
5249 shrink_encoding_region (beg, end, coding, str)
5250 int *beg, *end;
5251 struct coding_system *coding;
5252 unsigned char *str;
5253 {
5254 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5255 int eol_conversion;
5256 Lisp_Object translation_table;
5257
5258 if (coding->type == coding_type_ccl
5259 || coding->eol_type == CODING_EOL_CRLF
5260 || coding->eol_type == CODING_EOL_CR
5261 || (coding->cmp_data && coding->cmp_data->used > 0))
5262 {
5263 /* We can't skip any data. */
5264 return;
5265 }
5266 if (coding->type == coding_type_no_conversion
5267 || coding->type == coding_type_raw_text
5268 || coding->type == coding_type_emacs_mule
5269 || coding->type == coding_type_undecided)
5270 {
5271 /* We need no conversion, but don't have to skip any data here.
5272 Encoding routine handles them effectively anyway. */
5273 return;
5274 }
5275
5276 translation_table = coding->translation_table_for_encode;
5277 if (NILP (translation_table) && !NILP (Venable_character_translation))
5278 translation_table = Vstandard_translation_table_for_encode;
5279 if (CHAR_TABLE_P (translation_table))
5280 {
5281 int i;
5282 for (i = 0; i < 128; i++)
5283 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5284 break;
5285 if (i < 128)
5286 /* Some ASCII character should be translated. We give up
5287 shrinking. */
5288 return;
5289 }
5290
5291 if (str)
5292 {
5293 begp_orig = begp = str + *beg;
5294 endp_orig = endp = str + *end;
5295 }
5296 else
5297 {
5298 begp_orig = begp = BYTE_POS_ADDR (*beg);
5299 endp_orig = endp = begp + *end - *beg;
5300 }
5301
5302 eol_conversion = (coding->eol_type == CODING_EOL_CR
5303 || coding->eol_type == CODING_EOL_CRLF);
5304
5305 /* Here, we don't have to check coding->pre_write_conversion because
5306 the caller is expected to have handled it already. */
5307 switch (coding->type)
5308 {
5309 case coding_type_iso2022:
5310 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5311 /* We can't skip any data. */
5312 break;
5313 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5314 {
5315 unsigned char *bol = begp;
5316 while (begp < endp && *begp < 0x80)
5317 {
5318 begp++;
5319 if (begp[-1] == '\n')
5320 bol = begp;
5321 }
5322 begp = bol;
5323 goto label_skip_tail;
5324 }
5325 /* fall down ... */
5326
5327 case coding_type_sjis:
5328 case coding_type_big5:
5329 /* We can skip all ASCII characters at the head and tail. */
5330 if (eol_conversion)
5331 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5332 else
5333 while (begp < endp && *begp < 0x80) begp++;
5334 label_skip_tail:
5335 if (eol_conversion)
5336 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5337 else
5338 while (begp < endp && *(endp - 1) < 0x80) endp--;
5339 break;
5340
5341 default:
5342 abort ();
5343 }
5344
5345 *beg += begp - begp_orig;
5346 *end += endp - endp_orig;
5347 return;
5348 }
5349
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region between *BEG and *END with
   shrink_encoding_region (if ENCODEP is nonzero) or
   shrink_decoding_region (if ENCODEP is zero), provided that the
   region is longer than shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)	\
  do {									\
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)		\
      {									\
	if (encodep)							\
	  {								\
	    shrink_encoding_region (beg, end, coding, str);		\
	  }								\
	else								\
	  {								\
	    shrink_decoding_region (beg, end, coding, str);		\
	  }								\
      }									\
  } while (0)
5363
5364 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5365 Vlast_coding_system_used and the remaining elements are buffers to
5366 kill. */
5367 static Lisp_Object
5368 code_convert_region_unwind (arg)
5369 Lisp_Object arg;
5370 {
5371 struct gcpro gcpro1;
5372 GCPRO1 (arg);
5373
5374 inhibit_pre_post_conversion = 0;
5375 Vlast_coding_system_used = XCAR (arg);
5376 for (arg = XCDR (arg); CONSP (arg); arg = XCDR (arg))
5377 Fkill_buffer (XCAR (arg));
5378
5379 UNGCPRO;
5380 return Qnil;
5381 }
5382
5383 /* Store information about all compositions in the range FROM and TO
5384 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5385 buffer or a string, defaults to the current buffer. */
5386
5387 void
5388 coding_save_composition (coding, from, to, obj)
5389 struct coding_system *coding;
5390 int from, to;
5391 Lisp_Object obj;
5392 {
5393 Lisp_Object prop;
5394 int start, end;
5395
5396 if (coding->composing == COMPOSITION_DISABLED)
5397 return;
5398 if (!coding->cmp_data)
5399 coding_allocate_composition_data (coding, from);
5400 if (!find_composition (from, to, &start, &end, &prop, obj)
5401 || end > to)
5402 return;
5403 if (start < from
5404 && (!find_composition (end, to, &start, &end, &prop, obj)
5405 || end > to))
5406 return;
5407 coding->composing = COMPOSITION_NO;
5408 do
5409 {
5410 if (COMPOSITION_VALID_P (start, end, prop))
5411 {
5412 enum composition_method method = COMPOSITION_METHOD (prop);
5413 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5414 >= COMPOSITION_DATA_SIZE)
5415 coding_allocate_composition_data (coding, from);
5416 /* For relative composition, we remember start and end
5417 positions, for the other compositions, we also remember
5418 components. */
5419 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5420 if (method != COMPOSITION_RELATIVE)
5421 {
5422 /* We must store a*/
5423 Lisp_Object val, ch;
5424
5425 val = COMPOSITION_COMPONENTS (prop);
5426 if (CONSP (val))
5427 while (CONSP (val))
5428 {
5429 ch = XCAR (val), val = XCDR (val);
5430 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5431 }
5432 else if (VECTORP (val) || STRINGP (val))
5433 {
5434 int len = (VECTORP (val)
5435 ? XVECTOR (val)->size : SCHARS (val));
5436 int i;
5437 for (i = 0; i < len; i++)
5438 {
5439 ch = (STRINGP (val)
5440 ? Faref (val, make_number (i))
5441 : XVECTOR (val)->contents[i]);
5442 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5443 }
5444 }
5445 else /* INTEGERP (val) */
5446 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5447 }
5448 CODING_ADD_COMPOSITION_END (coding, end - from);
5449 }
5450 start = end;
5451 }
5452 while (start < to
5453 && find_composition (start, to, &start, &end, &prop, obj)
5454 && end <= to);
5455
5456 /* Make coding->cmp_data point to the first memory block. */
5457 while (coding->cmp_data->prev)
5458 coding->cmp_data = coding->cmp_data->prev;
5459 coding->cmp_data_start = 0;
5460 }
5461
5462 /* Reflect the saved information about compositions to OBJ.
5463 CODING->cmp_data points to a memory block for the information. OBJ
5464 is a buffer or a string, defaults to the current buffer. */
5465
5466 void
5467 coding_restore_composition (coding, obj)
5468 struct coding_system *coding;
5469 Lisp_Object obj;
5470 {
5471 struct composition_data *cmp_data = coding->cmp_data;
5472
5473 if (!cmp_data)
5474 return;
5475
5476 while (cmp_data->prev)
5477 cmp_data = cmp_data->prev;
5478
5479 while (cmp_data)
5480 {
5481 int i;
5482
5483 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5484 i += cmp_data->data[i])
5485 {
5486 int *data = cmp_data->data + i;
5487 enum composition_method method = (enum composition_method) data[3];
5488 Lisp_Object components;
5489
5490 if (data[0] < 0 || i + data[0] > cmp_data->used)
5491 /* Invalid composition data. */
5492 break;
5493
5494 if (method == COMPOSITION_RELATIVE)
5495 components = Qnil;
5496 else
5497 {
5498 int len = data[0] - 4, j;
5499 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5500
5501 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5502 && len % 2 == 0)
5503 len --;
5504 if (len < 1)
5505 /* Invalid composition data. */
5506 break;
5507 for (j = 0; j < len; j++)
5508 args[j] = make_number (data[4 + j]);
5509 components = (method == COMPOSITION_WITH_ALTCHARS
5510 ? Fstring (len, args)
5511 : Fvector (len, args));
5512 }
5513 compose_text (data[1], data[2], components, Qnil, obj);
5514 }
5515 cmp_data = cmp_data->next;
5516 }
5517 }
5518
5519 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5520 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5521 coding system CODING, and return the status code of code conversion
5522 (currently, this value has no meaning).
5523
5524 How many characters (and bytes) are converted to how many
5525 characters (and bytes) are recorded in members of the structure
5526 CODING.
5527
5528 If REPLACE is nonzero, we do various things as if the original text
5529 is deleted and a new text is inserted. See the comments in
5530 replace_range (insdel.c) to know what we are doing.
5531
5532 If REPLACE is zero, it is assumed that the source text is unibyte.
5533 Otherwise, it is assumed that the source text is multibyte. */
5534
5535 int
5536 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5537 int from, from_byte, to, to_byte, encodep, replace;
5538 struct coding_system *coding;
5539 {
5540 int len = to - from, len_byte = to_byte - from_byte;
5541 int nchars_del = 0, nbytes_del = 0;
5542 int require, inserted, inserted_byte;
5543 int head_skip, tail_skip, total_skip = 0;
5544 Lisp_Object saved_coding_symbol;
5545 int first = 1;
5546 unsigned char *src, *dst;
5547 Lisp_Object deletion;
5548 int orig_point = PT, orig_len = len;
5549 int prev_Z;
5550 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5551
5552 deletion = Qnil;
5553 saved_coding_symbol = coding->symbol;
5554
5555 if (from < PT && PT < to)
5556 {
5557 TEMP_SET_PT_BOTH (from, from_byte);
5558 orig_point = from;
5559 }
5560
5561 if (replace)
5562 {
5563 int saved_from = from;
5564 int saved_inhibit_modification_hooks;
5565
5566 prepare_to_modify_buffer (from, to, &from);
5567 if (saved_from != from)
5568 {
5569 to = from + len;
5570 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5571 len_byte = to_byte - from_byte;
5572 }
5573
5574 /* The code conversion routine can not preserve text properties
5575 for now. So, we must remove all text properties in the
5576 region. Here, we must suppress all modification hooks. */
5577 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5578 inhibit_modification_hooks = 1;
5579 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5580 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5581 }
5582
5583 coding->heading_ascii = 0;
5584
5585 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5586 {
5587 /* We must detect encoding of text and eol format. */
5588
5589 if (from < GPT && to > GPT)
5590 move_gap_both (from, from_byte);
5591 if (coding->type == coding_type_undecided)
5592 {
5593 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5594 if (coding->type == coding_type_undecided)
5595 {
5596 /* It seems that the text contains only ASCII, but we
5597 should not leave it undecided because the deeper
5598 decoding routine (decode_coding) tries to detect the
5599 encodings again in vain. */
5600 coding->type = coding_type_emacs_mule;
5601 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5602 /* As emacs-mule decoder will handle composition, we
5603 need this setting to allocate coding->cmp_data
5604 later. */
5605 coding->composing = COMPOSITION_NO;
5606 }
5607 }
5608 if (coding->eol_type == CODING_EOL_UNDECIDED
5609 && coding->type != coding_type_ccl)
5610 {
5611 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5612 if (coding->eol_type == CODING_EOL_UNDECIDED)
5613 coding->eol_type = CODING_EOL_LF;
5614 /* We had better recover the original eol format if we
5615 encounter an inconsistent eol format while decoding. */
5616 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5617 }
5618 }
5619
5620 /* Now we convert the text. */
5621
5622 /* For encoding, we must process pre-write-conversion in advance. */
5623 if (! inhibit_pre_post_conversion
5624 && encodep
5625 && SYMBOLP (coding->pre_write_conversion)
5626 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5627 {
5628 /* The function in pre-write-conversion may put a new text in a
5629 new buffer. */
5630 struct buffer *prev = current_buffer;
5631 Lisp_Object new;
5632
5633 record_unwind_protect (code_convert_region_unwind,
5634 Fcons (Vlast_coding_system_used, Qnil));
5635 /* We should not call any more pre-write/post-read-conversion
5636 functions while this pre-write-conversion is running. */
5637 inhibit_pre_post_conversion = 1;
5638 call2 (coding->pre_write_conversion,
5639 make_number (from), make_number (to));
5640 inhibit_pre_post_conversion = 0;
5641 /* Discard the unwind protect. */
5642 specpdl_ptr--;
5643
5644 if (current_buffer != prev)
5645 {
5646 len = ZV - BEGV;
5647 new = Fcurrent_buffer ();
5648 set_buffer_internal_1 (prev);
5649 del_range_2 (from, from_byte, to, to_byte, 0);
5650 TEMP_SET_PT_BOTH (from, from_byte);
5651 insert_from_buffer (XBUFFER (new), 1, len, 0);
5652 Fkill_buffer (new);
5653 if (orig_point >= to)
5654 orig_point += len - orig_len;
5655 else if (orig_point > from)
5656 orig_point = from;
5657 orig_len = len;
5658 to = from + len;
5659 from_byte = CHAR_TO_BYTE (from);
5660 to_byte = CHAR_TO_BYTE (to);
5661 len_byte = to_byte - from_byte;
5662 TEMP_SET_PT_BOTH (from, from_byte);
5663 }
5664 }
5665
5666 if (replace)
5667 {
5668 if (! EQ (current_buffer->undo_list, Qt))
5669 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5670 else
5671 {
5672 nchars_del = to - from;
5673 nbytes_del = to_byte - from_byte;
5674 }
5675 }
5676
5677 if (coding->composing != COMPOSITION_DISABLED)
5678 {
5679 if (encodep)
5680 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5681 else
5682 coding_allocate_composition_data (coding, from);
5683 }
5684
5685 /* Try to skip the heading and tailing ASCIIs. We can't skip them
5686 if we must run CCL program or there are compositions to
5687 encode. */
5688 if (coding->type != coding_type_ccl
5689 && (! coding->cmp_data || coding->cmp_data->used == 0))
5690 {
5691 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5692
5693 if (from < GPT && GPT < to)
5694 move_gap_both (from, from_byte);
5695 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5696 if (from_byte == to_byte
5697 && (encodep || NILP (coding->post_read_conversion))
5698 && ! CODING_REQUIRE_FLUSHING (coding))
5699 {
5700 coding->produced = len_byte;
5701 coding->produced_char = len;
5702 if (!replace)
5703 /* We must record and adjust for this new text now. */
5704 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5705 coding_free_composition_data (coding);
5706 return 0;
5707 }
5708
5709 head_skip = from_byte - from_byte_orig;
5710 tail_skip = to_byte_orig - to_byte;
5711 total_skip = head_skip + tail_skip;
5712 from += head_skip;
5713 to -= tail_skip;
5714 len -= total_skip; len_byte -= total_skip;
5715 }
5716
5717 /* For conversion, we must put the gap before the text in addition to
5718 making the gap larger for efficient decoding. The required gap
5719 size starts from 2000 which is the magic number used in make_gap.
5720 But, after one batch of conversion, it will be incremented if we
5721 find that it is not enough . */
5722 require = 2000;
5723
5724 if (GAP_SIZE < require)
5725 make_gap (require - GAP_SIZE);
5726 move_gap_both (from, from_byte);
5727
5728 inserted = inserted_byte = 0;
5729
5730 GAP_SIZE += len_byte;
5731 ZV -= len;
5732 Z -= len;
5733 ZV_BYTE -= len_byte;
5734 Z_BYTE -= len_byte;
5735
5736 if (GPT - BEG < BEG_UNCHANGED)
5737 BEG_UNCHANGED = GPT - BEG;
5738 if (Z - GPT < END_UNCHANGED)
5739 END_UNCHANGED = Z - GPT;
5740
5741 if (!encodep && coding->src_multibyte)
5742 {
5743 /* Decoding routines expects that the source text is unibyte.
5744 We must convert 8-bit characters of multibyte form to
5745 unibyte. */
5746 int len_byte_orig = len_byte;
5747 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5748 if (len_byte < len_byte_orig)
5749 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5750 len_byte);
5751 coding->src_multibyte = 0;
5752 }
5753
5754 for (;;)
5755 {
5756 int result;
5757
5758 /* The buffer memory is now:
5759 +--------+converted-text+---------+-------original-text-------+---+
5760 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5761 |<---------------------- GAP ----------------------->| */
5762 src = GAP_END_ADDR - len_byte;
5763 dst = GPT_ADDR + inserted_byte;
5764
5765 if (encodep)
5766 result = encode_coding (coding, src, dst, len_byte, 0);
5767 else
5768 {
5769 if (coding->composing != COMPOSITION_DISABLED)
5770 coding->cmp_data->char_offset = from + inserted;
5771 result = decode_coding (coding, src, dst, len_byte, 0);
5772 }
5773
5774 /* The buffer memory is now:
5775 +--------+-------converted-text----+--+------original-text----+---+
5776 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5777 |<---------------------- GAP ----------------------->| */
5778
5779 inserted += coding->produced_char;
5780 inserted_byte += coding->produced;
5781 len_byte -= coding->consumed;
5782
5783 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5784 {
5785 coding_allocate_composition_data (coding, from + inserted);
5786 continue;
5787 }
5788
5789 src += coding->consumed;
5790 dst += coding->produced;
5791
5792 if (result == CODING_FINISH_NORMAL)
5793 {
5794 src += len_byte;
5795 break;
5796 }
5797 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5798 {
5799 unsigned char *pend = dst, *p = pend - inserted_byte;
5800 Lisp_Object eol_type;
5801
5802 /* Encode LFs back to the original eol format (CR or CRLF). */
5803 if (coding->eol_type == CODING_EOL_CR)
5804 {
5805 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5806 }
5807 else
5808 {
5809 int count = 0;
5810
5811 while (p < pend) if (*p++ == '\n') count++;
5812 if (src - dst < count)
5813 {
5814 /* We don't have sufficient room for encoding LFs
5815 back to CRLF. We must record converted and
5816 not-yet-converted text back to the buffer
5817 content, enlarge the gap, then record them out of
5818 the buffer contents again. */
5819 int add = len_byte + inserted_byte;
5820
5821 GAP_SIZE -= add;
5822 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5823 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5824 make_gap (count - GAP_SIZE);
5825 GAP_SIZE += add;
5826 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5827 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5828 /* Don't forget to update SRC, DST, and PEND. */
5829 src = GAP_END_ADDR - len_byte;
5830 dst = GPT_ADDR + inserted_byte;
5831 pend = dst;
5832 }
5833 inserted += count;
5834 inserted_byte += count;
5835 coding->produced += count;
5836 p = dst = pend + count;
5837 while (count)
5838 {
5839 *--p = *--pend;
5840 if (*p == '\n') count--, *--p = '\r';
5841 }
5842 }
5843
5844 /* Suppress eol-format conversion in the further conversion. */
5845 coding->eol_type = CODING_EOL_LF;
5846
5847 /* Set the coding system symbol to that for Unix-like EOL. */
5848 eol_type = Fget (saved_coding_symbol, Qeol_type);
5849 if (VECTORP (eol_type)
5850 && XVECTOR (eol_type)->size == 3
5851 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5852 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5853 else
5854 coding->symbol = saved_coding_symbol;
5855
5856 continue;
5857 }
5858 if (len_byte <= 0)
5859 {
5860 if (coding->type != coding_type_ccl
5861 || coding->mode & CODING_MODE_LAST_BLOCK)
5862 break;
5863 coding->mode |= CODING_MODE_LAST_BLOCK;
5864 continue;
5865 }
5866 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5867 {
5868 /* The source text ends in invalid codes. Let's just
5869 make them valid buffer contents, and finish conversion. */
5870 if (multibyte_p)
5871 {
5872 unsigned char *start = dst;
5873
5874 inserted += len_byte;
5875 while (len_byte--)
5876 {
5877 int c = *src++;
5878 dst += CHAR_STRING (c, dst);
5879 }
5880
5881 inserted_byte += dst - start;
5882 }
5883 else
5884 {
5885 inserted += len_byte;
5886 inserted_byte += len_byte;
5887 while (len_byte--)
5888 *dst++ = *src++;
5889 }
5890 break;
5891 }
5892 if (result == CODING_FINISH_INTERRUPT)
5893 {
5894 /* The conversion procedure was interrupted by a user. */
5895 break;
5896 }
5897 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5898 if (coding->consumed < 1)
5899 {
5900 /* It's quite strange to require more memory without
5901 consuming any bytes. Perhaps CCL program bug. */
5902 break;
5903 }
5904 if (first)
5905 {
5906 /* We have just done the first batch of conversion which was
5907 stopped because of insufficient gap. Let's reconsider the
5908 required gap size (i.e. SRT - DST) now.
5909
5910 We have converted ORIG bytes (== coding->consumed) into
5911 NEW bytes (coding->produced). To convert the remaining
5912 LEN bytes, we may need REQUIRE bytes of gap, where:
5913 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5914 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5915 Here, we are sure that NEW >= ORIG. */
5916
5917 if (coding->produced <= coding->consumed)
5918 {
5919 /* This happens because of CCL-based coding system with
5920 eol-type CRLF. */
5921 require = 0;
5922 }
5923 else
5924 {
5925 float ratio = coding->produced - coding->consumed;
5926 ratio /= coding->consumed;
5927 require = len_byte * ratio;
5928 }
5929 first = 0;
5930 }
5931 if ((src - dst) < (require + 2000))
5932 {
5933 /* See the comment above the previous call of make_gap. */
5934 int add = len_byte + inserted_byte;
5935
5936 GAP_SIZE -= add;
5937 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5938 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5939 make_gap (require + 2000);
5940 GAP_SIZE += add;
5941 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5942 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5943 }
5944 }
5945 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5946
5947 if (encodep && coding->dst_multibyte)
5948 {
5949 /* The output is unibyte. We must convert 8-bit characters to
5950 multibyte form. */
5951 if (inserted_byte * 2 > GAP_SIZE)
5952 {
5953 GAP_SIZE -= inserted_byte;
5954 ZV += inserted_byte; Z += inserted_byte;
5955 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5956 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5957 make_gap (inserted_byte - GAP_SIZE);
5958 GAP_SIZE += inserted_byte;
5959 ZV -= inserted_byte; Z -= inserted_byte;
5960 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5961 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5962 }
5963 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5964 }
5965
5966 /* If we shrank the conversion area, adjust it now. */
5967 if (total_skip > 0)
5968 {
5969 if (tail_skip > 0)
5970 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5971 inserted += total_skip; inserted_byte += total_skip;
5972 GAP_SIZE += total_skip;
5973 GPT -= head_skip; GPT_BYTE -= head_skip;
5974 ZV -= total_skip; ZV_BYTE -= total_skip;
5975 Z -= total_skip; Z_BYTE -= total_skip;
5976 from -= head_skip; from_byte -= head_skip;
5977 to += tail_skip; to_byte += tail_skip;
5978 }
5979
5980 prev_Z = Z;
5981 if (! EQ (current_buffer->undo_list, Qt))
5982 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5983 else
5984 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5985 inserted, inserted_byte);
5986 inserted = Z - prev_Z;
5987
5988 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5989 coding_restore_composition (coding, Fcurrent_buffer ());
5990 coding_free_composition_data (coding);
5991
5992 if (! inhibit_pre_post_conversion
5993 && ! encodep && ! NILP (coding->post_read_conversion))
5994 {
5995 Lisp_Object val;
5996 Lisp_Object saved_coding_system;
5997
5998 if (from != PT)
5999 TEMP_SET_PT_BOTH (from, from_byte);
6000 prev_Z = Z;
6001 record_unwind_protect (code_convert_region_unwind,
6002 Fcons (Vlast_coding_system_used, Qnil));
6003 saved_coding_system = Vlast_coding_system_used;
6004 Vlast_coding_system_used = coding->symbol;
6005 /* We should not call any more pre-write/post-read-conversion
6006 functions while this post-read-conversion is running. */
6007 inhibit_pre_post_conversion = 1;
6008 val = call1 (coding->post_read_conversion, make_number (inserted));
6009 inhibit_pre_post_conversion = 0;
6010 coding->symbol = Vlast_coding_system_used;
6011 Vlast_coding_system_used = saved_coding_system;
6012 /* Discard the unwind protect. */
6013 specpdl_ptr--;
6014 CHECK_NUMBER (val);
6015 inserted += Z - prev_Z;
6016 }
6017
6018 if (orig_point >= from)
6019 {
6020 if (orig_point >= from + orig_len)
6021 orig_point += inserted - orig_len;
6022 else
6023 orig_point = from;
6024 TEMP_SET_PT (orig_point);
6025 }
6026
6027 if (replace)
6028 {
6029 signal_after_change (from, to - from, inserted);
6030 update_compositions (from, from + inserted, CHECK_BORDER);
6031 }
6032
6033 {
6034 coding->consumed = to_byte - from_byte;
6035 coding->consumed_char = to - from;
6036 coding->produced = inserted_byte;
6037 coding->produced_char = inserted;
6038 }
6039
6040 return 0;
6041 }
6042
6043 /* Name (or base name) of work buffer for code conversion.
   set_conversion_work_buffer hands it to Fget_buffer_create, and to
   Fgenerate_new_buffer_name when a fresh buffer name is needed. */
6044 static Lisp_Object Vcode_conversion_workbuf_name;
6045
6046 /* Set the current buffer to the working buffer prepared for
6047 code-conversion. MULTIBYTE specifies the multibyteness of the
6048 buffer. Return the buffer we set if it must be killed after use.
6049 Otherwise return Qnil. */
6050
6051 static Lisp_Object
6052 set_conversion_work_buffer (multibyte)
6053 int multibyte;
6054 {
6055 Lisp_Object buffer, buffer_to_kill;
6056 struct buffer *buf;
6057
6058 buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6059 buf = XBUFFER (buffer);
6060 if (buf == current_buffer)
6061 {
6062 /* As we are already in the work buffer, we must generate a new
6063 buffer for the work. */
6064 Lisp_Object name;
6065
6066 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6067 buffer = buffer_to_kill = Fget_buffer_create (name);
6068 buf = XBUFFER (buffer);
6069 }
6070 else
6071 buffer_to_kill = Qnil;
6072
6073 delete_all_overlays (buf);
6074 buf->directory = current_buffer->directory;
6075 buf->read_only = Qnil;
6076 buf->filename = Qnil;
6077 buf->undo_list = Qt;
6078 eassert (buf->overlays_before == NULL);
6079 eassert (buf->overlays_after == NULL);
6080 set_buffer_internal (buf);
6081 if (BEG != BEGV || Z != ZV)
6082 Fwiden ();
6083 del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6084 buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6085 return buffer_to_kill;
6086 }
6087
6088 Lisp_Object
6089 run_pre_post_conversion_on_str (str, coding, encodep)
6090 Lisp_Object str;
6091 struct coding_system *coding;
6092 int encodep;
6093 {
6094 int count = SPECPDL_INDEX ();
6095 struct gcpro gcpro1, gcpro2;
6096 int multibyte = STRING_MULTIBYTE (str);
6097 Lisp_Object old_deactivate_mark;
6098 Lisp_Object buffer_to_kill;
6099 Lisp_Object unwind_arg;
6100
6101 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6102 /* It is not crucial to specbind this. */
6103 old_deactivate_mark = Vdeactivate_mark;
6104 GCPRO2 (str, old_deactivate_mark);
6105
6106 /* We must insert the contents of STR as is without
6107 unibyte<->multibyte conversion. For that, we adjust the
6108 multibyteness of the working buffer to that of STR. */
6109 buffer_to_kill = set_conversion_work_buffer (multibyte);
6110 if (NILP (buffer_to_kill))
6111 unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6112 else
6113 unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6114 record_unwind_protect (code_convert_region_unwind, unwind_arg);
6115
6116 insert_from_string (str, 0, 0,
6117 SCHARS (str), SBYTES (str), 0);
6118 UNGCPRO;
6119 inhibit_pre_post_conversion = 1;
6120 if (encodep)
6121 {
6122 struct buffer *prev = current_buffer;
6123
6124 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6125 if (prev != current_buffer)
6126 /* We must kill the current buffer too. */
6127 Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6128 }
6129 else
6130 {
6131 Vlast_coding_system_used = coding->symbol;
6132 TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6133 call1 (coding->post_read_conversion, make_number (Z - BEG));
6134 coding->symbol = Vlast_coding_system_used;
6135 }
6136 inhibit_pre_post_conversion = 0;
6137 Vdeactivate_mark = old_deactivate_mark;
6138 str = make_buffer_string (BEG, Z, 1);
6139 return unbind_to (count, str);
6140 }
6141
6142
6143 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6144 text in *STR. *SIZE is the allocated bytes for STR. As it
6145 is intended that this function is called from encode_terminal_code,
6146 the pre-write-conversion function is run by safe_call and thus
6147 "Error during redisplay: ..." is logged when an error occurs.
6148
6149 Store the resulting text in *STR and set CODING->produced_char and
6150 CODING->produced to the number of characters and bytes
6151 respectively. If the size of *STR is too small, enlarge it by
6152 xrealloc and update *STR and *SIZE. */
6153
6154 void
6155 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6156 unsigned char **str;
6157 int *size, nchars, nbytes;
6158 struct coding_system *coding;
6159 {
6160 struct gcpro gcpro1, gcpro2;
6161 struct buffer *cur = current_buffer;
6162 struct buffer *prev;
6163 Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6164 Lisp_Object args[3];
6165 Lisp_Object buffer_to_kill;
6166
6167 /* It is not crucial to specbind this. */
6168 old_deactivate_mark = Vdeactivate_mark;
6169 old_last_coding_system_used = Vlast_coding_system_used;
6170 GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6171
6172 /* We must insert the contents of STR as is without
6173 unibyte<->multibyte conversion. For that, we adjust the
6174 multibyteness of the working buffer to that of STR. */
6175 buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6176 insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6177 UNGCPRO;
6178 inhibit_pre_post_conversion = 1;
6179 prev = current_buffer;
6180 args[0] = coding->pre_write_conversion;
6181 args[1] = make_number (BEG);
6182 args[2] = make_number (Z);
6183 safe_call (3, args);
6184 inhibit_pre_post_conversion = 0;
6185 Vdeactivate_mark = old_deactivate_mark;
6186 Vlast_coding_system_used = old_last_coding_system_used;
6187 coding->produced_char = Z - BEG;
6188 coding->produced = Z_BYTE - BEG_BYTE;
6189 if (coding->produced > *size)
6190 {
6191 *size = coding->produced;
6192 *str = xrealloc (*str, *size);
6193 }
6194 if (BEG < GPT && GPT < Z)
6195 move_gap (BEG);
6196 bcopy (BEG_ADDR, *str, coding->produced);
6197 coding->src_multibyte
6198 = ! NILP (current_buffer->enable_multibyte_characters);
6199 if (prev != current_buffer)
6200 Fkill_buffer (Fcurrent_buffer ());
6201 set_buffer_internal (cur);
6202 if (! NILP (buffer_to_kill))
6203 Fkill_buffer (buffer_to_kill);
6204 }
6205
6206
/* Decode the text of string STR according to CODING and return the
   decoded text as a string.

   If no decoding is required and CODING->post_read_conversion is not
   an fbound symbol, STR itself is returned when NOCOPY is nonzero and
   a copy of it otherwise (NOCOPY is forced to 1 once STR has been
   replaced by a string made in this function).

   CODING->src_multibyte, CODING->dst_multibyte and
   CODING->heading_ascii are set here.  On return CODING->consumed,
   CODING->consumed_char, CODING->produced and CODING->produced_char
   hold the byte and character counts of the decoding, including the
   bytes that were skipped at the head and tail and copied without
   conversion.  */
6207 Lisp_Object
6208 decode_coding_string (str, coding, nocopy)
6209 Lisp_Object str;
6210 struct coding_system *coding;
6211 int nocopy;
6212 {
6213 int len;
6214 struct conversion_buffer buf;
6215 int from, to_byte;
6216 Lisp_Object saved_coding_symbol;
6217 int result;
6218 int require_decoding;
6219 int shrinked_bytes = 0;
6220 Lisp_Object newstr;
6221 int consumed, consumed_char, produced, produced_char;
6222
6223 from = 0;
6224 to_byte = SBYTES (str);
6225
6226 saved_coding_symbol = coding->symbol;
6227 coding->src_multibyte = STRING_MULTIBYTE (str);
6228 coding->dst_multibyte = 1;
6229 coding->heading_ascii = 0;
6230
6231 if (CODING_REQUIRE_DETECTION (coding))
6232 {
6233 /* See the comments in code_convert_region. */
6234 if (coding->type == coding_type_undecided)
6235 {
6236 detect_coding (coding, SDATA (str), to_byte);
6237 if (coding->type == coding_type_undecided)
6238 {
6239 coding->type = coding_type_emacs_mule;
6240 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6241 /* As emacs-mule decoder will handle composition, we
6242 need this setting to allocate coding->cmp_data
6243 later. */
6244 coding->composing = COMPOSITION_NO;
6245 }
6246 }
6247 if (coding->eol_type == CODING_EOL_UNDECIDED
6248 && coding->type != coding_type_ccl)
6249 {
6250 saved_coding_symbol = coding->symbol;
6251 detect_eol (coding, SDATA (str), to_byte);
6252 if (coding->eol_type == CODING_EOL_UNDECIDED)
6253 coding->eol_type = CODING_EOL_LF;
6254 /* We had better recover the original eol format if we
6255 encounter an inconsistent eol format while decoding. */
6256 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6257 }
6258 }
6259
/* These two types produce unibyte text. */
6260 if (coding->type == coding_type_no_conversion
6261 || coding->type == coding_type_raw_text)
6262 coding->dst_multibyte = 0;
6263
6264 require_decoding = CODING_REQUIRE_DECODING (coding);
6265
6266 if (STRING_MULTIBYTE (str))
6267 {
6268 /* Decoding routines expect the source text to be unibyte. */
6269 str = Fstring_as_unibyte (str);
6270 to_byte = SBYTES (str);
6271 nocopy = 1;
6272 coding->src_multibyte = 0;
6273 }
6274
6275 /* Try to skip the heading and tailing ASCIIs. */
6276 if (require_decoding && coding->type != coding_type_ccl)
6277 {
6278 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6279 0);
6280 if (from == to_byte)
6281 require_decoding = 0;
/* Bytes skipped at the head (FROM) plus bytes skipped at the tail. */
6282 shrinked_bytes = from + (SBYTES (str) - to_byte);
6283 }
6284
/* Nothing to decode and no post-read-conversion function to run:
   return STR (or a copy), made multibyte if the destination is
   multibyte. */
6285 if (!require_decoding
6286 && !(SYMBOLP (coding->post_read_conversion)
6287 && !NILP (Ffboundp (coding->post_read_conversion))))
6288 {
6289 coding->consumed = SBYTES (str);
6290 coding->consumed_char = SCHARS (str);
6291 if (coding->dst_multibyte)
6292 {
6293 str = Fstring_as_multibyte (str);
6294 nocopy = 1;
6295 }
6296 coding->produced = SBYTES (str);
6297 coding->produced_char = SCHARS (str);
6298 return (nocopy ? str : Fcopy_sequence (str));
6299 }
6300
6301 if (coding->composing != COMPOSITION_DISABLED)
6302 coding_allocate_composition_data (coding, from);
6303 len = decoding_buffer_size (coding, to_byte - from);
6304 allocate_conversion_buffer (buf, len);
6305
/* Call decode_coding repeatedly, each time continuing after the
   bytes consumed and produced so far, and accumulate the counts it
   reports in CODING. */
6306 consumed = consumed_char = produced = produced_char = 0;
6307 while (1)
6308 {
6309 result = decode_coding (coding, SDATA (str) + from + consumed,
6310 buf.data + produced, to_byte - from - consumed,
6311 buf.size - produced);
6312 consumed += coding->consumed;
6313 consumed_char += coding->consumed_char;
6314 produced += coding->produced;
6315 produced_char += coding->produced_char;
/* Stop when finished or interrupted, or when the source is
   insufficient and the last call consumed nothing. */
6316 if (result == CODING_FINISH_NORMAL
6317 || result == CODING_FINISH_INTERRUPT
6318 || (result == CODING_FINISH_INSUFFICIENT_SRC
6319 && coding->consumed == 0))
6320 break;
6321 if (result == CODING_FINISH_INSUFFICIENT_CMP)
6322 coding_allocate_composition_data (coding, from + produced_char);
6323 else if (result == CODING_FINISH_INSUFFICIENT_DST)
6324 extend_conversion_buffer (&buf);
6325 else if (result == CODING_FINISH_INCONSISTENT_EOL)
6326 {
6327 Lisp_Object eol_type;
6328
6329 /* Recover the original EOL format. */
6330 if (coding->eol_type == CODING_EOL_CR)
6331 {
/* Turn every LF produced so far back into CR. */
6332 unsigned char *p;
6333 for (p = buf.data; p < buf.data + produced; p++)
6334 if (*p == '\n') *p = '\r';
6335 }
6336 else if (coding->eol_type == CODING_EOL_CRLF)
6337 {
/* Count the LFs produced so far, then move the text toward the end
   of the buffer, working backward, putting a CR in front of each
   LF. */
6338 int num_eol = 0;
6339 unsigned char *p0, *p1;
6340 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6341 if (*p0 == '\n') num_eol++;
/* NOTE(review): the buffer is extended only once here; presumably
   one extension always provides NUM_EOL more bytes -- confirm
   against extend_conversion_buffer. */
6342 if (produced + num_eol >= buf.size)
6343 extend_conversion_buffer (&buf);
6344 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6345 {
6346 *--p1 = *--p0;
6347 if (*p0 == '\n') *--p1 = '\r';
6348 }
6349 produced += num_eol;
6350 produced_char += num_eol;
6351 }
6352 /* Suppress eol-format conversion in the further conversion. */
6353 coding->eol_type = CODING_EOL_LF;
6354
6355 /* Set the coding system symbol to that for Unix-like EOL. */
6356 eol_type = Fget (saved_coding_symbol, Qeol_type);
6357 if (VECTORP (eol_type)
6358 && XVECTOR (eol_type)->size == 3
6359 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6360 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6361 else
6362 coding->symbol = saved_coding_symbol;
6363
6364
6365 }
6366 }
6367
6368 coding->consumed = consumed;
6369 coding->consumed_char = consumed_char;
6370 coding->produced = produced;
6371 coding->produced_char = produced_char;
6372
/* Build the result: the skipped head of STR, the decoded text, then
   the skipped tail of STR. */
6373 if (coding->dst_multibyte)
6374 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6375 produced + shrinked_bytes);
6376 else
6377 newstr = make_uninit_string (produced + shrinked_bytes);
6378 if (from > 0)
6379 STRING_COPYIN (newstr, 0, SDATA (str), from);
6380 STRING_COPYIN (newstr, from, buf.data, produced);
6381 if (shrinked_bytes > from)
6382 STRING_COPYIN (newstr, from + produced,
6383 SDATA (str) + to_byte,
6384 shrinked_bytes - from);
6385 free_conversion_buffer (&buf);
6386
/* Account for the skipped bytes; each one is counted as one
   character as well. */
6387 coding->consumed += shrinked_bytes;
6388 coding->consumed_char += shrinked_bytes;
6389 coding->produced += shrinked_bytes;
6390 coding->produced_char += shrinked_bytes;
6391
6392 if (coding->cmp_data && coding->cmp_data->used)
6393 coding_restore_composition (coding, newstr);
6394 coding_free_composition_data (coding);
6395
/* Finally run the post-read-conversion function, if there is one. */
6396 if (SYMBOLP (coding->post_read_conversion)
6397 && !NILP (Ffboundp (coding->post_read_conversion)))
6398 newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6399
6400 return newstr;
6401 }
6402
/* Encode the text of string STR according to CODING and return the
   encoded text as a string made by make_uninit_string.

   If CODING->pre_write_conversion is an fbound symbol, STR is first
   replaced by the result of running that function on it.

   If no encoding is needed, STR is made unibyte and returned: STR
   itself when NOCOPY is nonzero (it is then modified in place by
   STRING_SET_UNIBYTE if it was multibyte), a new string otherwise.

   CODING->src_multibyte, CODING->dst_multibyte and
   CODING->heading_ascii are set here.  On return CODING->consumed,
   CODING->consumed_char, CODING->produced and CODING->produced_char
   hold byte and character counts.  */
6403 Lisp_Object
6404 encode_coding_string (str, coding, nocopy)
6405 Lisp_Object str;
6406 struct coding_system *coding;
6407 int nocopy;
6408 {
6409 int len;
6410 struct conversion_buffer buf;
6411 int from, to, to_byte;
6412 int result;
6413 int shrinked_bytes = 0;
6414 Lisp_Object newstr;
6415 int consumed, consumed_char, produced, produced_char;
6416
6417 if (SYMBOLP (coding->pre_write_conversion)
6418 && !NILP (Ffboundp (coding->pre_write_conversion)))
6419 {
6420 str = run_pre_post_conversion_on_str (str, coding, 1);
6421 /* As STR is just newly generated, we don't have to copy it
6422 anymore. */
6423 nocopy = 1;
6424 }
6425
6426 from = 0;
6427 to = SCHARS (str);
6428 to_byte = SBYTES (str);
6429
6430 /* Encoding routines determine the multibyteness of the source text
6431 by coding->src_multibyte. */
6432 coding->src_multibyte = SCHARS (str) < SBYTES (str);
6433 coding->dst_multibyte = 0;
6434 if (! CODING_REQUIRE_ENCODING (coding))
6435 goto no_need_of_encoding;
6436
6437 if (coding->composing != COMPOSITION_DISABLED)
6438 coding_save_composition (coding, from, to, str);
6439
6440 /* Try to skip the heading and tailing ASCIIs. We can't skip them
6441 if we must run CCL program or there are compositions to
6442 encode. */
6443 coding->heading_ascii = 0;
6444 if (coding->type != coding_type_ccl
6445 && (! coding->cmp_data || coding->cmp_data->used == 0))
6446 {
6447 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6448 1);
6449 if (from == to_byte)
6450 {
/* The whole text was skipped, so nothing is left to encode. */
6451 coding_free_composition_data (coding);
6452 goto no_need_of_encoding;
6453 }
/* Bytes skipped at the head (FROM) plus bytes skipped at the tail. */
6454 shrinked_bytes = from + (SBYTES (str) - to_byte);
6455 }
6456
6457 len = encoding_buffer_size (coding, to_byte - from);
6458 allocate_conversion_buffer (buf, len);
6459
/* Call encode_coding repeatedly, each time continuing after the
   bytes consumed and produced so far, and accumulate the counts it
   reports in CODING. */
6460 consumed = consumed_char = produced = produced_char = 0;
6461 while (1)
6462 {
6463 result = encode_coding (coding, SDATA (str) + from + consumed,
6464 buf.data + produced, to_byte - from - consumed,
6465 buf.size - produced);
6466 consumed += coding->consumed;
6467 consumed_char += coding->consumed_char;
6468 produced += coding->produced;
6469 produced_char += coding->produced_char;
6470 if (result == CODING_FINISH_NORMAL
6471 || result == CODING_FINISH_INTERRUPT
6472 || (result == CODING_FINISH_INSUFFICIENT_SRC
6473 && coding->consumed == 0))
6474 break;
6475 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6476 extend_conversion_buffer (&buf);
6477 }
6478
/* NOTE(review): unlike decode_coding_string, these counts are not
   increased by SHRINKED_BYTES here -- confirm whether callers rely
   on that. */
6479 coding->consumed = consumed;
6480 coding->consumed_char = consumed_char;
6481 coding->produced = produced;
6482 coding->produced_char = produced_char;
6483
/* Build the result: the skipped head of STR, the encoded text, then
   the skipped tail of STR. */
6484 newstr = make_uninit_string (produced + shrinked_bytes);
6485 if (from > 0)
6486 STRING_COPYIN (newstr, 0, SDATA (str), from);
6487 STRING_COPYIN (newstr, from, buf.data, produced);
6488 if (shrinked_bytes > from)
6489 STRING_COPYIN (newstr, from + produced,
6490 SDATA (str) + to_byte,
6491 shrinked_bytes - from);
6492
6493 free_conversion_buffer (&buf);
6494 coding_free_composition_data (coding);
6495
6496 return newstr;
6497
/* Reached when the text needs no encoding at all. */
6498 no_need_of_encoding:
6499 coding->consumed = SBYTES (str);
6500 coding->consumed_char = SCHARS (str);
6501 if (STRING_MULTIBYTE (str))
6502 {
6503 if (nocopy)
6504 /* We are sure that STR doesn't contain a multibyte
6505 character. */
6506 STRING_SET_UNIBYTE (str);
6507 else
6508 {
6509 str = Fstring_as_unibyte (str);
6510 nocopy = 1;
6511 }
6512 }
6513 coding->produced = SBYTES (str);
6514 coding->produced_char = SCHARS (str);
6515 return (nocopy ? str : Fcopy_sequence (str));
6516 }
6517
6518 \f
6519 #ifdef emacs
6520 /*** 8. Emacs Lisp library functions ***/
6521
6522 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6523 doc: /* Return t if OBJECT is nil or a coding-system.
6524 See the documentation of `make-coding-system' for information
6525 about coding-system objects. */)
6526 (obj)
6527 Lisp_Object obj;
6528 {
6529 if (NILP (obj))
6530 return Qt;
6531 if (!SYMBOLP (obj))
6532 return Qnil;
6533 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6534 return Qt;
6535 /* Get coding-spec vector for OBJ. */
6536 obj = Fget (obj, Qcoding_system);
6537 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6538 ? Qt : Qnil);
6539 }
6540
6541 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6542 Sread_non_nil_coding_system, 1, 1, 0,
6543 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6544 (prompt)
6545 Lisp_Object prompt;
6546 {
6547 Lisp_Object val;
6548 do
6549 {
6550 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6551 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6552 }
6553 while (SCHARS (val) == 0);
6554 return (Fintern (val, Qnil));
6555 }
6556
6557 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6558 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6559 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
6560 Ignores case when completing coding systems (all Emacs coding systems
6561 are lower-case). */)
6562 (prompt, default_coding_system)
6563 Lisp_Object prompt, default_coding_system;
6564 {
6565 Lisp_Object val;
6566 int count = SPECPDL_INDEX ();
6567
6568 if (SYMBOLP (default_coding_system))
6569 default_coding_system = SYMBOL_NAME (default_coding_system);
6570 specbind (Qcompletion_ignore_case, Qt);
6571 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6572 Qt, Qnil, Qcoding_system_history,
6573 default_coding_system, Qnil);
6574 unbind_to (count, Qnil);
6575 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6576 }
6577
6578 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6579 1, 1, 0,
6580 doc: /* Check validity of CODING-SYSTEM.
6581 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6582 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6583 The value of this property should be a vector of length 5. */)
6584 (coding_system)
6585 Lisp_Object coding_system;
6586 {
6587 Lisp_Object define_form;
6588
6589 define_form = Fget (coding_system, Qcoding_system_define_form);
6590 if (! NILP (define_form))
6591 {
6592 Fput (coding_system, Qcoding_system_define_form, Qnil);
6593 safe_eval (define_form);
6594 }
6595 if (!NILP (Fcoding_system_p (coding_system)))
6596 return coding_system;
6597 xsignal1 (Qcoding_system_error, coding_system);
6598 }
6599 \f
6600 Lisp_Object
6601 detect_coding_system (src, src_bytes, highest, multibytep)
6602 const unsigned char *src;
6603 int src_bytes, highest;
6604 int multibytep;
6605 {
6606 int coding_mask, eol_type;
6607 Lisp_Object val, tmp;
6608 int dummy;
6609
6610 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6611 eol_type = detect_eol_type (src, src_bytes, &dummy);
6612 if (eol_type == CODING_EOL_INCONSISTENT)
6613 eol_type = CODING_EOL_UNDECIDED;
6614
6615 if (!coding_mask)
6616 {
6617 val = Qundecided;
6618 if (eol_type != CODING_EOL_UNDECIDED)
6619 {
6620 Lisp_Object val2;
6621 val2 = Fget (Qundecided, Qeol_type);
6622 if (VECTORP (val2))
6623 val = XVECTOR (val2)->contents[eol_type];
6624 }
6625 return (highest ? val : Fcons (val, Qnil));
6626 }
6627
6628 /* At first, gather possible coding systems in VAL. */
6629 val = Qnil;
6630 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6631 {
6632 Lisp_Object category_val, category_index;
6633
6634 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6635 category_val = Fsymbol_value (XCAR (tmp));
6636 if (!NILP (category_val)
6637 && NATNUMP (category_index)
6638 && (coding_mask & (1 << XFASTINT (category_index))))
6639 {
6640 val = Fcons (category_val, val);
6641 if (highest)
6642 break;
6643 }
6644 }
6645 if (!highest)
6646 val = Fnreverse (val);
6647
6648 /* Then, replace the elements with subsidiary coding systems. */
6649 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6650 {
6651 if (eol_type != CODING_EOL_UNDECIDED
6652 && eol_type != CODING_EOL_INCONSISTENT)
6653 {
6654 Lisp_Object eol;
6655 eol = Fget (XCAR (tmp), Qeol_type);
6656 if (VECTORP (eol))
6657 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6658 }
6659 }
6660 return (highest ? XCAR (val) : val);
6661 }
6662
6663 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6664 2, 3, 0,
6665 doc: /* Detect how the byte sequence in the region is encoded.
6666 Return a list of possible coding systems used on decoding a byte
6667 sequence containing the bytes in the region between START and END when
6668 the coding system `undecided' is specified. The list is ordered by
6669 priority decided in the current language environment.
6670
6671 If only ASCII characters are found (except for such ISO-2022 control
6672 characters ISO-2022 as ESC), it returns a list of single element
6673 `undecided' or its subsidiary coding system according to a detected
6674 end-of-line format.
6675
6676 If optional argument HIGHEST is non-nil, return the coding system of
6677 highest priority. */)
6678 (start, end, highest)
6679 Lisp_Object start, end, highest;
6680 {
6681 int from, to;
6682 int from_byte, to_byte;
6683 int include_anchor_byte = 0;
6684
6685 CHECK_NUMBER_COERCE_MARKER (start);
6686 CHECK_NUMBER_COERCE_MARKER (end);
6687
6688 validate_region (&start, &end);
6689 from = XINT (start), to = XINT (end);
6690 from_byte = CHAR_TO_BYTE (from);
6691 to_byte = CHAR_TO_BYTE (to);
6692
6693 if (from < GPT && to >= GPT)
6694 move_gap_both (to, to_byte);
6695 /* If we an anchor byte `\0' follows the region, we include it in
6696 the detecting source. Then code detectors can handle the tailing
6697 byte sequence more accurately.
6698
6699 Fix me: This is not a perfect solution. It is better that we
6700 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6701 */
6702 if (to == Z || (to == GPT && GAP_SIZE > 0))
6703 include_anchor_byte = 1;
6704 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6705 to_byte - from_byte + include_anchor_byte,
6706 !NILP (highest),
6707 !NILP (current_buffer
6708 ->enable_multibyte_characters));
6709 }
6710
6711 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6712 1, 2, 0,
6713 doc: /* Detect how the byte sequence in STRING is encoded.
6714 Return a list of possible coding systems used on decoding a byte
6715 sequence containing the bytes in STRING when the coding system
6716 `undecided' is specified. The list is ordered by priority decided in
6717 the current language environment.
6718
6719 If only ASCII characters are found (except for such ISO-2022 control
6720 characters ISO-2022 as ESC), it returns a list of single element
6721 `undecided' or its subsidiary coding system according to a detected
6722 end-of-line format.
6723
6724 If optional argument HIGHEST is non-nil, return the coding system of
6725 highest priority. */)
6726 (string, highest)
6727 Lisp_Object string, highest;
6728 {
6729 CHECK_STRING (string);
6730
6731 return detect_coding_system (SDATA (string),
6732 /* "+ 1" is to include the anchor byte
6733 `\0'. With this, code detectors can
6734 handle the tailing bytes more
6735 accurately. */
6736 SBYTES (string) + 1,
6737 !NILP (highest),
6738 STRING_MULTIBYTE (string));
6739 }
6740
6741 /* Subroutine for Ffind_coding_systems_region_internal.
6742
6743 Return a list of coding systems that safely encode the multibyte
6744 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
6745 possible coding systems. If it is nil, no candidate is left and
6746 the text is only scanned for a non-ASCII single byte char.
   The return value is SAFE_CODINGS with the entries that cannot
   encode some character of the text removed; the list is modified
   destructively, and entries may be rewritten into a longer format
   (see below).
6747
6748 WORK_TABLE is a char-table whose element is set to t once the
6749 element is looked up, so that each character is checked only once.
6750
6751 If a non-ASCII single byte char is found, set
6752 *single_byte_char_found to 1. */
6753
6754 static Lisp_Object
6755 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6756 unsigned char *p, *pend;
6757 Lisp_Object safe_codings, work_table;
6758 int *single_byte_char_found;
6759 {
6760 int c, len;
6761 Lisp_Object val, ch;
6762 Lisp_Object prev, tail;
6763
6764 if (NILP (safe_codings))
6765 goto done_safe_codings;
6766 while (p < pend)
6767 {
6768 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6769 p += len;
6770 if (ASCII_BYTE_P (c))
6771 /* We can ignore ASCII characters here. */
6772 continue;
6773 if (SINGLE_BYTE_CHAR_P (c))
6774 *single_byte_char_found = 1;
6775 /* Check the safe coding systems for C. */
6776 ch = make_number (c);
6777 val = Faref (work_table, ch);
6778 if (EQ (val, Qt))
6779 /* This element was already checked. Ignore it. */
6780 continue;
6781 /* Remember that we checked this element. */
6782 Faset (work_table, ch, Qt);
6783
/* Walk the candidate list. PREV is the last entry that was kept,
   used to unlink a rejected entry that is not the head of the list. */
6784 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6785 {
/* TRANSLATION_TABLE, HASH_TABLE and ACCEPT_LATIN_EXTRA are set only
   when ENCODABLE is 0, which is the only case in which they are read
   below. */
6786 Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6787 int encodable;
6788
6789 elt = XCAR (tail);
6790 if (CONSP (XCDR (elt)))
6791 {
6792 /* This entry has this format now:
6793 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6794 ACCEPT-LATIN-EXTRA ) */
6795 val = XCDR (elt);
6796 encodable = ! NILP (Faref (XCAR (val), ch));
6797 if (! encodable)
6798 {
6799 val = XCDR (val);
6800 translation_table = XCAR (val);
6801 hash_table = XCAR (XCDR (val));
6802 accept_latin_extra = XCAR (XCDR (XCDR (val)));
6803 }
6804 }
6805 else
6806 {
6807 /* This entry has this format now: ( CODING . SAFE-CHARS) */
6808 encodable = ! NILP (Faref (XCDR (elt), ch));
6809 if (! encodable)
6810 {
6811 /* Transform the format to:
6812 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6813 ACCEPT-LATIN-EXTRA ) */
6814 val = Fget (XCAR (elt), Qcoding_system);
6815 translation_table
6816 = Fplist_get (AREF (val, 3),
6817 Qtranslation_table_for_encode);
6818 if (SYMBOLP (translation_table))
6819 translation_table = Fget (translation_table,
6820 Qtranslation_table);
6821 hash_table
6822 = (CHAR_TABLE_P (translation_table)
6823 ? XCHAR_TABLE (translation_table)->extras[1]
6824 : Qnil);
6825 accept_latin_extra
6826 = ((EQ (AREF (val, 0), make_number (2))
6827 && VECTORP (AREF (val, 4)))
6828 ? AREF (AREF (val, 4), 16)
6829 : Qnil);
6830 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6831 translation_table, hash_table,
6832 accept_latin_extra));
6833 }
6834 }
6835
/* A character missing from SAFE-CHARS is still encodable if the
   translation table or its hash table has an entry for it, or if it
   is a single byte char accepted through Vlatin_extra_code_table. */
6836 if (! encodable
6837 && ((CHAR_TABLE_P (translation_table)
6838 && ! NILP (Faref (translation_table, ch)))
6839 || (HASH_TABLE_P (hash_table)
6840 && ! NILP (Fgethash (ch, hash_table, Qnil)))
6841 || (SINGLE_BYTE_CHAR_P (c)
6842 && ! NILP (accept_latin_extra)
6843 && VECTORP (Vlatin_extra_code_table)
6844 && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6845 encodable = 1;
6846 if (encodable)
6847 prev = tail;
6848 else
6849 {
6850 /* Exclude this coding system from SAFE_CODINGS. */
6851 if (EQ (tail, safe_codings))
6852 {
6853 safe_codings = XCDR (safe_codings);
6854 if (NILP (safe_codings))
6855 goto done_safe_codings;
6856 }
6857 else
6858 XSETCDR (prev, XCDR (tail));
6859 }
6860 }
6861 }
6862
6863 done_safe_codings:
6864 /* If the above loop was terminated before P reaches PEND, it means
6865 SAFE_CODINGS was set to nil. If we have not yet found a
6866 non-ASCII single-byte char, check it now. */
6867 if (! *single_byte_char_found)
6868 while (p < pend)
6869 {
6870 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6871 p += len;
6872 if (! ASCII_BYTE_P (c)
6873 && SINGLE_BYTE_CHAR_P (c))
6874 {
6875 *single_byte_char_found = 1;
6876 break;
6877 }
6878 }
6879 return safe_codings;
6880 }
6881
6882 DEFUN ("find-coding-systems-region-internal",
6883 Ffind_coding_systems_region_internal,
6884 Sfind_coding_systems_region_internal, 2, 2, 0,
6885 doc: /* Internal use only. */)
6886 (start, end)
6887 Lisp_Object start, end;
6888 {
6889 Lisp_Object work_table, safe_codings;
6890 int non_ascii_p = 0;
6891 int single_byte_char_found = 0;
6892 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6893
6894 if (STRINGP (start))
6895 {
6896 if (!STRING_MULTIBYTE (start))
6897 return Qt;
6898 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6899 p2 = p2end = p1end;
6900 if (SCHARS (start) != SBYTES (start))
6901 non_ascii_p = 1;
6902 }
6903 else
6904 {
6905 int from, to, stop;
6906
6907 CHECK_NUMBER_COERCE_MARKER (start);
6908 CHECK_NUMBER_COERCE_MARKER (end);
6909 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6910 args_out_of_range (start, end);
6911 if (NILP (current_buffer->enable_multibyte_characters))
6912 return Qt;
6913 from = CHAR_TO_BYTE (XINT (start));
6914 to = CHAR_TO_BYTE (XINT (end));
6915 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6916 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6917 if (stop == to)
6918 p2 = p2end = p1end;
6919 else
6920 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6921 if (XINT (end) - XINT (start) != to - from)
6922 non_ascii_p = 1;
6923 }
6924
6925 if (!non_ascii_p)
6926 {
6927 /* We are sure that the text contains no multibyte character.
6928 Check if it contains eight-bit-graphic. */
6929 p = p1;
6930 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6931 if (p == p1end)
6932 {
6933 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6934 if (p == p2end)
6935 return Qt;
6936 }
6937 }
6938
6939 /* The text contains non-ASCII characters. */
6940
6941 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6942 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6943
6944 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6945 &single_byte_char_found);
6946 if (p2 < p2end)
6947 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6948 &single_byte_char_found);
6949 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6950 safe_codings = Qt;
6951 else
6952 {
6953 /* Turn safe_codings to a list of coding systems... */
6954 Lisp_Object val;
6955
6956 if (single_byte_char_found)
6957 /* ... and append these for eight-bit chars. */
6958 val = Fcons (Qraw_text,
6959 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6960 else
6961 /* ... and append generic coding systems. */
6962 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6963
6964 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6965 val = Fcons (XCAR (XCAR (safe_codings)), val);
6966 safe_codings = val;
6967 }
6968
6969 return safe_codings;
6970 }
6971
6972
6973 /* Search from position POS for such characters that are unencodable
6974 accoding to SAFE_CHARS, and return a list of their positions. P
6975 points where in the memory the character at POS exists. Limit the
6976 search at PEND or when Nth unencodable characters are found.
6977
6978 If SAFE_CHARS is a char table, an element for an unencodable
6979 character is nil.
6980
6981 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6982
6983 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6984 eight-bit-graphic characters are unencodable. */
6985
6986 static Lisp_Object
6987 unencodable_char_position (safe_chars, pos, p, pend, n)
6988 Lisp_Object safe_chars;
6989 int pos;
6990 unsigned char *p, *pend;
6991 int n;
6992 {
6993 Lisp_Object pos_list;
6994
6995 pos_list = Qnil;
6996 while (p < pend)
6997 {
6998 int len;
6999 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
7000
7001 if (c >= 128
7002 && (CHAR_TABLE_P (safe_chars)
7003 ? NILP (CHAR_TABLE_REF (safe_chars, c))
7004 : (NILP (safe_chars) || c < 256)))
7005 {
7006 pos_list = Fcons (make_number (pos), pos_list);
7007 if (--n <= 0)
7008 break;
7009 }
7010 pos++;
7011 p += len;
7012 }
7013 return Fnreverse (pos_list);
7014 }
7015
7016
7017 DEFUN ("unencodable-char-position", Funencodable_char_position,
7018 Sunencodable_char_position, 3, 5, 0,
7019 doc: /*
7020 Return position of first un-encodable character in a region.
7021 START and END specfiy the region and CODING-SYSTEM specifies the
7022 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7023
7024 If optional 4th argument COUNT is non-nil, it specifies at most how
7025 many un-encodable characters to search. In this case, the value is a
7026 list of positions.
7027
7028 If optional 5th argument STRING is non-nil, it is a string to search
7029 for un-encodable characters. In that case, START and END are indexes
7030 to the string. */)
7031 (start, end, coding_system, count, string)
7032 Lisp_Object start, end, coding_system, count, string;
7033 {
7034 int n;
7035 Lisp_Object safe_chars;
7036 struct coding_system coding;
7037 Lisp_Object positions;
7038 int from, to;
7039 unsigned char *p, *pend;
7040
7041 if (NILP (string))
7042 {
7043 validate_region (&start, &end);
7044 from = XINT (start);
7045 to = XINT (end);
7046 if (NILP (current_buffer->enable_multibyte_characters))
7047 return Qnil;
7048 p = CHAR_POS_ADDR (from);
7049 if (to == GPT)
7050 pend = GPT_ADDR;
7051 else
7052 pend = CHAR_POS_ADDR (to);
7053 }
7054 else
7055 {
7056 CHECK_STRING (string);
7057 CHECK_NATNUM (start);
7058 CHECK_NATNUM (end);
7059 from = XINT (start);
7060 to = XINT (end);
7061 if (from > to
7062 || to > SCHARS (string))
7063 args_out_of_range_3 (string, start, end);
7064 if (! STRING_MULTIBYTE (string))
7065 return Qnil;
7066 p = SDATA (string) + string_char_to_byte (string, from);
7067 pend = SDATA (string) + string_char_to_byte (string, to);
7068 }
7069
7070 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7071
7072 if (NILP (count))
7073 n = 1;
7074 else
7075 {
7076 CHECK_NATNUM (count);
7077 n = XINT (count);
7078 }
7079
7080 if (coding.type == coding_type_no_conversion
7081 || coding.type == coding_type_raw_text)
7082 return Qnil;
7083
7084 if (coding.type == coding_type_undecided)
7085 safe_chars = Qnil;
7086 else
7087 safe_chars = coding_safe_chars (coding_system);
7088
7089 if (STRINGP (string)
7090 || from >= GPT || to <= GPT)
7091 positions = unencodable_char_position (safe_chars, from, p, pend, n);
7092 else
7093 {
7094 Lisp_Object args[2];
7095
7096 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7097 n -= XINT (Flength (args[0]));
7098 if (n <= 0)
7099 positions = args[0];
7100 else
7101 {
7102 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7103 pend, n);
7104 positions = Fappend (2, args);
7105 }
7106 }
7107
7108 return (NILP (count) ? Fcar (positions) : positions);
7109 }
7110
7111
7112 Lisp_Object
7113 code_convert_region1 (start, end, coding_system, encodep)
7114 Lisp_Object start, end, coding_system;
7115 int encodep;
7116 {
7117 struct coding_system coding;
7118 int from, to;
7119
7120 CHECK_NUMBER_COERCE_MARKER (start);
7121 CHECK_NUMBER_COERCE_MARKER (end);
7122 CHECK_SYMBOL (coding_system);
7123
7124 validate_region (&start, &end);
7125 from = XFASTINT (start);
7126 to = XFASTINT (end);
7127
7128 if (NILP (coding_system))
7129 return make_number (to - from);
7130
7131 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7132 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7133
7134 coding.mode |= CODING_MODE_LAST_BLOCK;
7135 coding.src_multibyte = coding.dst_multibyte
7136 = !NILP (current_buffer->enable_multibyte_characters);
7137 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7138 &coding, encodep, 1);
7139 Vlast_coding_system_used = coding.symbol;
7140 return make_number (coding.produced_char);
7141 }
7142
7143 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7144 3, 3, "r\nzCoding system: ",
7145 doc: /* Decode the current region from the specified coding system.
7146 When called from a program, takes three arguments:
7147 START, END, and CODING-SYSTEM. START and END are buffer positions.
7148 This function sets `last-coding-system-used' to the precise coding system
7149 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7150 not fully specified.)
7151 It returns the length of the decoded text. */)
7152 (start, end, coding_system)
7153 Lisp_Object start, end, coding_system;
7154 {
7155 return code_convert_region1 (start, end, coding_system, 0);
7156 }
7157
7158 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7159 3, 3, "r\nzCoding system: ",
7160 doc: /* Encode the current region into the specified coding system.
7161 When called from a program, takes three arguments:
7162 START, END, and CODING-SYSTEM. START and END are buffer positions.
7163 This function sets `last-coding-system-used' to the precise coding system
7164 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7165 not fully specified.)
7166 It returns the length of the encoded text. */)
7167 (start, end, coding_system)
7168 Lisp_Object start, end, coding_system;
7169 {
7170 return code_convert_region1 (start, end, coding_system, 1);
7171 }
7172
7173 Lisp_Object
7174 code_convert_string1 (string, coding_system, nocopy, encodep)
7175 Lisp_Object string, coding_system, nocopy;
7176 int encodep;
7177 {
7178 struct coding_system coding;
7179
7180 CHECK_STRING (string);
7181 CHECK_SYMBOL (coding_system);
7182
7183 if (NILP (coding_system))
7184 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7185
7186 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7187 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7188
7189 coding.mode |= CODING_MODE_LAST_BLOCK;
7190 string = (encodep
7191 ? encode_coding_string (string, &coding, !NILP (nocopy))
7192 : decode_coding_string (string, &coding, !NILP (nocopy)));
7193 Vlast_coding_system_used = coding.symbol;
7194
7195 return string;
7196 }
7197
7198 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7199 2, 3, 0,
7200 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7201 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7202 if the decoding operation is trivial.
7203 This function sets `last-coding-system-used' to the precise coding system
7204 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7205 not fully specified.) */)
7206 (string, coding_system, nocopy)
7207 Lisp_Object string, coding_system, nocopy;
7208 {
7209 return code_convert_string1 (string, coding_system, nocopy, 0);
7210 }
7211
7212 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7213 2, 3, 0,
7214 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7215 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7216 if the encoding operation is trivial.
7217 This function sets `last-coding-system-used' to the precise coding system
7218 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7219 not fully specified.) */)
7220 (string, coding_system, nocopy)
7221 Lisp_Object string, coding_system, nocopy;
7222 {
7223 return code_convert_string1 (string, coding_system, nocopy, 1);
7224 }
7225
7226 /* Encode or decode STRING according to CODING_SYSTEM.
7227 Do not set Vlast_coding_system_used.
7228
7229 This function is called only from macros DECODE_FILE and
7230 ENCODE_FILE, thus we ignore character composition. */
7231
7232 Lisp_Object
7233 code_convert_string_norecord (string, coding_system, encodep)
7234 Lisp_Object string, coding_system;
7235 int encodep;
7236 {
7237 struct coding_system coding;
7238
7239 CHECK_STRING (string);
7240 CHECK_SYMBOL (coding_system);
7241
7242 if (NILP (coding_system))
7243 return string;
7244
7245 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7246 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7247
7248 coding.composing = COMPOSITION_DISABLED;
7249 coding.mode |= CODING_MODE_LAST_BLOCK;
7250 return (encodep
7251 ? encode_coding_string (string, &coding, 1)
7252 : decode_coding_string (string, &coding, 1));
7253 }
7254 \f
7255 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7256 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7257 Return the corresponding character. */)
7258 (code)
7259 Lisp_Object code;
7260 {
7261 unsigned char c1, c2, s1, s2;
7262 Lisp_Object val;
7263
7264 CHECK_NUMBER (code);
7265 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7266 if (s1 == 0)
7267 {
7268 if (s2 < 0x80)
7269 XSETFASTINT (val, s2);
7270 else if (s2 >= 0xA0 || s2 <= 0xDF)
7271 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7272 else
7273 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7274 }
7275 else
7276 {
7277 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7278 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7279 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7280 DECODE_SJIS (s1, s2, c1, c2);
7281 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7282 }
7283 return val;
7284 }
7285
7286 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7287 doc: /* Encode a Japanese character CH to shift_jis encoding.
7288 Return the corresponding code in SJIS. */)
7289 (ch)
7290 Lisp_Object ch;
7291 {
7292 int charset, c1, c2, s1, s2;
7293 Lisp_Object val;
7294
7295 CHECK_NUMBER (ch);
7296 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7297 if (charset == CHARSET_ASCII)
7298 {
7299 val = ch;
7300 }
7301 else if (charset == charset_jisx0208
7302 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7303 {
7304 ENCODE_SJIS (c1, c2, s1, s2);
7305 XSETFASTINT (val, (s1 << 8) | s2);
7306 }
7307 else if (charset == charset_katakana_jisx0201
7308 && c1 > 0x20 && c2 < 0xE0)
7309 {
7310 XSETFASTINT (val, c1 | 0x80);
7311 }
7312 else
7313 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7314 return val;
7315 }
7316
7317 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7318 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7319 Return the corresponding character. */)
7320 (code)
7321 Lisp_Object code;
7322 {
7323 int charset;
7324 unsigned char b1, b2, c1, c2;
7325 Lisp_Object val;
7326
7327 CHECK_NUMBER (code);
7328 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7329 if (b1 == 0)
7330 {
7331 if (b2 >= 0x80)
7332 error ("Invalid BIG5 code: %x", XFASTINT (code));
7333 val = code;
7334 }
7335 else
7336 {
7337 if ((b1 < 0xA1 || b1 > 0xFE)
7338 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7339 error ("Invalid BIG5 code: %x", XFASTINT (code));
7340 DECODE_BIG5 (b1, b2, charset, c1, c2);
7341 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7342 }
7343 return val;
7344 }
7345
7346 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7347 doc: /* Encode the Big5 character CH to BIG5 coding system.
7348 Return the corresponding character code in Big5. */)
7349 (ch)
7350 Lisp_Object ch;
7351 {
7352 int charset, c1, c2, b1, b2;
7353 Lisp_Object val;
7354
7355 CHECK_NUMBER (ch);
7356 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7357 if (charset == CHARSET_ASCII)
7358 {
7359 val = ch;
7360 }
7361 else if ((charset == charset_big5_1
7362 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7363 || (charset == charset_big5_2
7364 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7365 {
7366 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7367 XSETFASTINT (val, (b1 << 8) | b2);
7368 }
7369 else
7370 error ("Can't encode to Big5: %d", XFASTINT (ch));
7371 return val;
7372 }
7373 \f
7374 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7375 Sset_terminal_coding_system_internal, 1, 2, 0,
7376 doc: /* Internal use only. */)
7377 (coding_system, terminal)
7378 Lisp_Object coding_system;
7379 Lisp_Object terminal;
7380 {
7381 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
7382 CHECK_SYMBOL (coding_system);
7383 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7384 /* We had better not send unsafe characters to terminal. */
7385 terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7386 /* Character composition should be disabled. */
7387 terminal_coding->composing = COMPOSITION_DISABLED;
7388 /* Error notification should be suppressed. */
7389 terminal_coding->suppress_error = 1;
7390 terminal_coding->src_multibyte = 1;
7391 terminal_coding->dst_multibyte = 0;
7392 return Qnil;
7393 }
7394
7395 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7396 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7397 doc: /* Internal use only. */)
7398 (coding_system)
7399 Lisp_Object coding_system;
7400 {
7401 CHECK_SYMBOL (coding_system);
7402 setup_coding_system (Fcheck_coding_system (coding_system),
7403 &safe_terminal_coding);
7404 /* Character composition should be disabled. */
7405 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7406 /* Error notification should be suppressed. */
7407 safe_terminal_coding.suppress_error = 1;
7408 safe_terminal_coding.src_multibyte = 1;
7409 safe_terminal_coding.dst_multibyte = 0;
7410 return Qnil;
7411 }
7412
7413 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7414 Sterminal_coding_system, 0, 1, 0,
7415 doc: /* Return coding system specified for terminal output on the given terminal.
7416 TERMINAL may be a terminal id, a frame, or nil for the selected
7417 frame's terminal device. */)
7418 (terminal)
7419 Lisp_Object terminal;
7420 {
7421 return TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1))->symbol;
7422 }
7423
7424 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7425 Sset_keyboard_coding_system_internal, 1, 2, 0,
7426 doc: /* Internal use only. */)
7427 (coding_system, terminal)
7428 Lisp_Object coding_system;
7429 Lisp_Object terminal;
7430 {
7431 struct terminal *t = get_terminal (terminal, 1);
7432 CHECK_SYMBOL (coding_system);
7433
7434 setup_coding_system (Fcheck_coding_system (coding_system),
7435 TERMINAL_KEYBOARD_CODING (t));
7436 /* Character composition should be disabled. */
7437 TERMINAL_KEYBOARD_CODING (t)->composing = COMPOSITION_DISABLED;
7438 return Qnil;
7439 }
7440
7441 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7442 Skeyboard_coding_system, 0, 1, 0,
7443 doc: /* Return coding system for decoding keyboard input on TERMINAL.
7444 TERMINAL may be a terminal id, a frame, or nil for the selected
7445 frame's terminal device. */)
7446 (terminal)
7447 Lisp_Object terminal;
7448 {
7449 return TERMINAL_KEYBOARD_CODING (get_terminal (terminal, 1))->symbol;
7450 }
7451
7452 \f
7453 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7454 Sfind_operation_coding_system, 1, MANY, 0,
7455 doc: /* Choose a coding system for an operation based on the target name.
7456 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7457 DECODING-SYSTEM is the coding system to use for decoding
7458 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7459 for encoding (in case OPERATION does encoding).
7460
7461 The first argument OPERATION specifies an I/O primitive:
7462 For file I/O, `insert-file-contents' or `write-region'.
7463 For process I/O, `call-process', `call-process-region', or `start-process'.
7464 For network I/O, `open-network-stream'.
7465
7466 The remaining arguments should be the same arguments that were passed
7467 to the primitive. Depending on which primitive, one of those arguments
7468 is selected as the TARGET. For example, if OPERATION does file I/O,
7469 whichever argument specifies the file name is TARGET.
7470
7471 TARGET has a meaning which depends on OPERATION:
7472 For file I/O, TARGET is a file name (except for the special case below).
7473 For process I/O, TARGET is a process name.
7474 For network I/O, TARGET is a service name or a port number
7475
7476 This function looks up what specified for TARGET in,
7477 `file-coding-system-alist', `process-coding-system-alist',
7478 or `network-coding-system-alist' depending on OPERATION.
7479 They may specify a coding system, a cons of coding systems,
7480 or a function symbol to call.
7481 In the last case, we call the function with one argument,
7482 which is a list of all the arguments given to this function.
7483 If the function can't decide a coding system, it can return
7484 `undecided' so that the normal code-detection is performed.
7485
7486 If OPERATION is `insert-file-contents', the argument corresponding to
7487 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
7488 file name to look up, and BUFFER is a buffer that contains the file's
7489 contents (not yet decoded). If `file-coding-system-alist' specifies a
7490 function to call for FILENAME, that function should examine the
7491 contents of BUFFER instead of reading the file.
7492
7493 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
7494 (nargs, args)
7495 int nargs;
7496 Lisp_Object *args;
7497 {
7498 Lisp_Object operation, target_idx, target, val;
7499 register Lisp_Object chain;
7500
7501 if (nargs < 2)
7502 error ("Too few arguments");
7503 operation = args[0];
7504 if (!SYMBOLP (operation)
7505 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7506 error ("Invalid first argument");
7507 if (nargs < 1 + XINT (target_idx))
7508 error ("Too few arguments for operation: %s",
7509 SDATA (SYMBOL_NAME (operation)));
7510 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7511 argument to write-region) is string, it must be treated as a
7512 target file name. */
7513 if (EQ (operation, Qwrite_region)
7514 && nargs > 5
7515 && STRINGP (args[5]))
7516 target_idx = make_number (4);
7517 target = args[XINT (target_idx) + 1];
7518 if (!(STRINGP (target)
7519 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7520 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7521 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7522 error ("Invalid argument %d", XINT (target_idx) + 1);
7523 if (CONSP (target))
7524 target = XCAR (target);
7525
7526 chain = ((EQ (operation, Qinsert_file_contents)
7527 || EQ (operation, Qwrite_region))
7528 ? Vfile_coding_system_alist
7529 : (EQ (operation, Qopen_network_stream)
7530 ? Vnetwork_coding_system_alist
7531 : Vprocess_coding_system_alist));
7532 if (NILP (chain))
7533 return Qnil;
7534
7535 for (; CONSP (chain); chain = XCDR (chain))
7536 {
7537 Lisp_Object elt;
7538 elt = XCAR (chain);
7539
7540 if (CONSP (elt)
7541 && ((STRINGP (target)
7542 && STRINGP (XCAR (elt))
7543 && fast_string_match (XCAR (elt), target) >= 0)
7544 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7545 {
7546 val = XCDR (elt);
7547 /* Here, if VAL is both a valid coding system and a valid
7548 function symbol, we return VAL as a coding system. */
7549 if (CONSP (val))
7550 return val;
7551 if (! SYMBOLP (val))
7552 return Qnil;
7553 if (! NILP (Fcoding_system_p (val)))
7554 return Fcons (val, val);
7555 if (! NILP (Ffboundp (val)))
7556 {
7557 /* We use call1 rather than safe_call1
7558 so as to get bug reports about functions called here
7559 which don't handle the current interface. */
7560 val = call1 (val, Flist (nargs, args));
7561 if (CONSP (val))
7562 return val;
7563 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7564 return Fcons (val, val);
7565 }
7566 return Qnil;
7567 }
7568 }
7569 return Qnil;
7570 }
7571
7572 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7573 Supdate_coding_systems_internal, 0, 0, 0,
7574 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7575 When values of any coding categories are changed, you must
7576 call this function. */)
7577 ()
7578 {
7579 int i;
7580
7581 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7582 {
7583 Lisp_Object val;
7584
7585 val = find_symbol_value (XVECTOR (Vcoding_category_table)->contents[i]);
7586 if (!NILP (val))
7587 {
7588 if (! coding_system_table[i])
7589 coding_system_table[i] = ((struct coding_system *)
7590 xmalloc (sizeof (struct coding_system)));
7591 setup_coding_system (val, coding_system_table[i]);
7592 }
7593 else if (coding_system_table[i])
7594 {
7595 xfree (coding_system_table[i]);
7596 coding_system_table[i] = NULL;
7597 }
7598 }
7599
7600 return Qnil;
7601 }
7602
7603 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7604 Sset_coding_priority_internal, 0, 0, 0,
7605 doc: /* Update internal database for the current value of `coding-category-list'.
7606 This function is internal use only. */)
7607 ()
7608 {
7609 int i = 0, idx;
7610 Lisp_Object val;
7611
7612 val = Vcoding_category_list;
7613
7614 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7615 {
7616 if (! SYMBOLP (XCAR (val)))
7617 break;
7618 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7619 if (idx >= CODING_CATEGORY_IDX_MAX)
7620 break;
7621 coding_priorities[i++] = (1 << idx);
7622 val = XCDR (val);
7623 }
7624 /* If coding-category-list is valid and contains all coding
7625 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7626 the following code saves Emacs from crashing. */
7627 while (i < CODING_CATEGORY_IDX_MAX)
7628 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7629
7630 return Qnil;
7631 }
7632
7633 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7634 Sdefine_coding_system_internal, 1, 1, 0,
7635 doc: /* Register CODING-SYSTEM as a base coding system.
7636 This function is internal use only. */)
7637 (coding_system)
7638 Lisp_Object coding_system;
7639 {
7640 Lisp_Object safe_chars, slot;
7641
7642 if (NILP (Fcheck_coding_system (coding_system)))
7643 xsignal1 (Qcoding_system_error, coding_system);
7644
7645 safe_chars = coding_safe_chars (coding_system);
7646 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7647 error ("No valid safe-chars property for %s",
7648 SDATA (SYMBOL_NAME (coding_system)));
7649
7650 if (EQ (safe_chars, Qt))
7651 {
7652 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7653 XSETCAR (Vcoding_system_safe_chars,
7654 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7655 }
7656 else
7657 {
7658 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7659 if (NILP (slot))
7660 XSETCDR (Vcoding_system_safe_chars,
7661 nconc2 (XCDR (Vcoding_system_safe_chars),
7662 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7663 else
7664 XSETCDR (slot, safe_chars);
7665 }
7666 return Qnil;
7667 }
7668
7669 #endif /* emacs */
7670
7671 \f
7672 /*** 9. Post-amble ***/
7673
7674 void
7675 init_coding_once ()
7676 {
7677 int i;
7678
7679 /* Emacs' internal format specific initialize routine. */
7680 for (i = 0; i <= 0x20; i++)
7681 emacs_code_class[i] = EMACS_control_code;
7682 emacs_code_class[0x0A] = EMACS_linefeed_code;
7683 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7684 for (i = 0x21 ; i < 0x7F; i++)
7685 emacs_code_class[i] = EMACS_ascii_code;
7686 emacs_code_class[0x7F] = EMACS_control_code;
7687 for (i = 0x80; i < 0xFF; i++)
7688 emacs_code_class[i] = EMACS_invalid_code;
7689 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7690 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7691 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7692 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7693
7694 /* ISO2022 specific initialize routine. */
7695 for (i = 0; i < 0x20; i++)
7696 iso_code_class[i] = ISO_control_0;
7697 for (i = 0x21; i < 0x7F; i++)
7698 iso_code_class[i] = ISO_graphic_plane_0;
7699 for (i = 0x80; i < 0xA0; i++)
7700 iso_code_class[i] = ISO_control_1;
7701 for (i = 0xA1; i < 0xFF; i++)
7702 iso_code_class[i] = ISO_graphic_plane_1;
7703 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7704 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7705 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7706 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7707 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7708 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7709 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7710 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7711 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7712 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7713
7714 setup_coding_system (Qnil, &safe_terminal_coding);
7715 setup_coding_system (Qnil, &default_buffer_file_coding);
7716
7717 bzero (coding_system_table, sizeof coding_system_table);
7718
7719 bzero (ascii_skip_code, sizeof ascii_skip_code);
7720 for (i = 0; i < 128; i++)
7721 ascii_skip_code[i] = 1;
7722
7723 #if defined (MSDOS) || defined (WINDOWSNT)
7724 system_eol_type = CODING_EOL_CRLF;
7725 #else
7726 system_eol_type = CODING_EOL_LF;
7727 #endif
7728
7729 inhibit_pre_post_conversion = 0;
7730 }
7731
7732 #ifdef emacs
7733
7734 void
7735 syms_of_coding ()
7736 {
7737 staticpro (&Vcode_conversion_workbuf_name);
7738 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7739
7740 Qtarget_idx = intern ("target-idx");
7741 staticpro (&Qtarget_idx);
7742
7743 Qcoding_system_history = intern ("coding-system-history");
7744 staticpro (&Qcoding_system_history);
7745 Fset (Qcoding_system_history, Qnil);
7746
7747 /* Target FILENAME is the first argument. */
7748 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7749 /* Target FILENAME is the third argument. */
7750 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7751
7752 Qcall_process = intern ("call-process");
7753 staticpro (&Qcall_process);
7754 /* Target PROGRAM is the first argument. */
7755 Fput (Qcall_process, Qtarget_idx, make_number (0));
7756
7757 Qcall_process_region = intern ("call-process-region");
7758 staticpro (&Qcall_process_region);
7759 /* Target PROGRAM is the third argument. */
7760 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7761
7762 Qstart_process = intern ("start-process");
7763 staticpro (&Qstart_process);
7764 /* Target PROGRAM is the third argument. */
7765 Fput (Qstart_process, Qtarget_idx, make_number (2));
7766
7767 Qopen_network_stream = intern ("open-network-stream");
7768 staticpro (&Qopen_network_stream);
7769 /* Target SERVICE is the fourth argument. */
7770 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7771
7772 Qcoding_system = intern ("coding-system");
7773 staticpro (&Qcoding_system);
7774
7775 Qeol_type = intern ("eol-type");
7776 staticpro (&Qeol_type);
7777
7778 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7779 staticpro (&Qbuffer_file_coding_system);
7780
7781 Qpost_read_conversion = intern ("post-read-conversion");
7782 staticpro (&Qpost_read_conversion);
7783
7784 Qpre_write_conversion = intern ("pre-write-conversion");
7785 staticpro (&Qpre_write_conversion);
7786
7787 Qno_conversion = intern ("no-conversion");
7788 staticpro (&Qno_conversion);
7789
7790 Qundecided = intern ("undecided");
7791 staticpro (&Qundecided);
7792
7793 Qcoding_system_p = intern ("coding-system-p");
7794 staticpro (&Qcoding_system_p);
7795
7796 Qcoding_system_error = intern ("coding-system-error");
7797 staticpro (&Qcoding_system_error);
7798
7799 Fput (Qcoding_system_error, Qerror_conditions,
7800 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7801 Fput (Qcoding_system_error, Qerror_message,
7802 build_string ("Invalid coding system"));
7803
7804 Qcoding_category = intern ("coding-category");
7805 staticpro (&Qcoding_category);
7806 Qcoding_category_index = intern ("coding-category-index");
7807 staticpro (&Qcoding_category_index);
7808
7809 Vcoding_category_table
7810 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7811 staticpro (&Vcoding_category_table);
7812 {
7813 int i;
7814 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7815 {
7816 XVECTOR (Vcoding_category_table)->contents[i]
7817 = intern (coding_category_name[i]);
7818 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7819 Qcoding_category_index, make_number (i));
7820 }
7821 }
7822
7823 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7824 staticpro (&Vcoding_system_safe_chars);
7825
7826 Qtranslation_table = intern ("translation-table");
7827 staticpro (&Qtranslation_table);
7828 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7829
7830 Qtranslation_table_id = intern ("translation-table-id");
7831 staticpro (&Qtranslation_table_id);
7832
7833 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7834 staticpro (&Qtranslation_table_for_decode);
7835
7836 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7837 staticpro (&Qtranslation_table_for_encode);
7838
7839 Qsafe_chars = intern ("safe-chars");
7840 staticpro (&Qsafe_chars);
7841
7842 Qchar_coding_system = intern ("char-coding-system");
7843 staticpro (&Qchar_coding_system);
7844
7845 /* Intern this now in case it isn't already done.
7846 Setting this variable twice is harmless.
7847 But don't staticpro it here--that is done in alloc.c. */
7848 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7849 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7850 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7851
7852 Qvalid_codes = intern ("valid-codes");
7853 staticpro (&Qvalid_codes);
7854
7855 Qascii_incompatible = intern ("ascii-incompatible");
7856 staticpro (&Qascii_incompatible);
7857
7858 Qemacs_mule = intern ("emacs-mule");
7859 staticpro (&Qemacs_mule);
7860
7861 Qraw_text = intern ("raw-text");
7862 staticpro (&Qraw_text);
7863
7864 Qutf_8 = intern ("utf-8");
7865 staticpro (&Qutf_8);
7866
7867 Qcoding_system_define_form = intern ("coding-system-define-form");
7868 staticpro (&Qcoding_system_define_form);
7869
7870 defsubr (&Scoding_system_p);
7871 defsubr (&Sread_coding_system);
7872 defsubr (&Sread_non_nil_coding_system);
7873 defsubr (&Scheck_coding_system);
7874 defsubr (&Sdetect_coding_region);
7875 defsubr (&Sdetect_coding_string);
7876 defsubr (&Sfind_coding_systems_region_internal);
7877 defsubr (&Sunencodable_char_position);
7878 defsubr (&Sdecode_coding_region);
7879 defsubr (&Sencode_coding_region);
7880 defsubr (&Sdecode_coding_string);
7881 defsubr (&Sencode_coding_string);
7882 defsubr (&Sdecode_sjis_char);
7883 defsubr (&Sencode_sjis_char);
7884 defsubr (&Sdecode_big5_char);
7885 defsubr (&Sencode_big5_char);
7886 defsubr (&Sset_terminal_coding_system_internal);
7887 defsubr (&Sset_safe_terminal_coding_system_internal);
7888 defsubr (&Sterminal_coding_system);
7889 defsubr (&Sset_keyboard_coding_system_internal);
7890 defsubr (&Skeyboard_coding_system);
7891 defsubr (&Sfind_operation_coding_system);
7892 defsubr (&Supdate_coding_systems_internal);
7893 defsubr (&Sset_coding_priority_internal);
7894 defsubr (&Sdefine_coding_system_internal);
7895
7896 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7897 doc: /* List of coding systems.
7898
7899 Do not alter the value of this variable manually. This variable should be
7900 updated by the functions `make-coding-system' and
7901 `define-coding-system-alias'. */);
7902 Vcoding_system_list = Qnil;
7903
7904 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7905 doc: /* Alist of coding system names.
7906 Each element is one element list of coding system name.
7907 This variable is given to `completing-read' as TABLE argument.
7908
7909 Do not alter the value of this variable manually. This variable should be
7910 updated by the functions `make-coding-system' and
7911 `define-coding-system-alias'. */);
7912 Vcoding_system_alist = Qnil;
7913
7914 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7915 doc: /* List of coding-categories (symbols) ordered by priority.
7916
7917 On detecting a coding system, Emacs tries code detection algorithms
7918 associated with each coding-category one by one in this order. When
7919 one algorithm agrees with a byte sequence of source text, the coding
7920 system bound to the corresponding coding-category is selected.
7921
7922 Don't modify this variable directly, but use `set-coding-priority'. */);
7923 {
7924 int i;
7925
7926 Vcoding_category_list = Qnil;
7927 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7928 Vcoding_category_list
7929 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7930 Vcoding_category_list);
7931 }
7932
7933 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7934 doc: /* Specify the coding system for read operations.
7935 It is useful to bind this variable with `let', but do not set it globally.
7936 If the value is a coding system, it is used for decoding on read operation.
7937 If not, an appropriate element is used from one of the coding system alists:
7938 There are three such tables, `file-coding-system-alist',
7939 `process-coding-system-alist', and `network-coding-system-alist'. */);
7940 Vcoding_system_for_read = Qnil;
7941
7942 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7943 doc: /* Specify the coding system for write operations.
7944 Programs bind this variable with `let', but you should not set it globally.
7945 If the value is a coding system, it is used for encoding of output,
7946 when writing it to a file and when sending it to a file or subprocess.
7947
7948 If this does not specify a coding system, an appropriate element
7949 is used from one of the coding system alists:
7950 There are three such tables, `file-coding-system-alist',
7951 `process-coding-system-alist', and `network-coding-system-alist'.
7952 For output to files, if the above procedure does not specify a coding system,
7953 the value of `buffer-file-coding-system' is used. */);
7954 Vcoding_system_for_write = Qnil;
7955
7956 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7957 doc: /* Coding system used in the latest file or process I/O.
7958 Also set by `encode-coding-region', `decode-coding-region',
7959 `encode-coding-string' and `decode-coding-string'. */);
7960 Vlast_coding_system_used = Qnil;
7961
7962 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7963 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7964 See info node `Coding Systems' and info node `Text and Binary' concerning
7965 such conversion. */);
7966 inhibit_eol_conversion = 0;
7967
7968 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7969 doc: /* Non-nil means process buffer inherits coding system of process output.
7970 Bind it to t if the process output is to be treated as if it were a file
7971 read from some filesystem. */);
7972 inherit_process_coding_system = 0;
7973
7974 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7975 doc: /* Alist to decide a coding system to use for a file I/O operation.
7976 The format is ((PATTERN . VAL) ...),
7977 where PATTERN is a regular expression matching a file name,
7978 VAL is a coding system, a cons of coding systems, or a function symbol.
7979 If VAL is a coding system, it is used for both decoding and encoding
7980 the file contents.
7981 If VAL is a cons of coding systems, the car part is used for decoding,
7982 and the cdr part is used for encoding.
7983 If VAL is a function symbol, the function must return a coding system
7984 or a cons of coding systems which are used as above. The function is
7985 called with an argument that is a list of the arguments with which
7986 `find-operation-coding-system' was called. If the function can't decide
7987 a coding system, it can return `undecided' so that the normal
7988 code-detection is performed.
7989
7990 See also the function `find-operation-coding-system'
7991 and the variable `auto-coding-alist'. */);
7992 Vfile_coding_system_alist = Qnil;
7993
7994 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7995 doc: /* Alist to decide a coding system to use for a process I/O operation.
7996 The format is ((PATTERN . VAL) ...),
7997 where PATTERN is a regular expression matching a program name,
7998 VAL is a coding system, a cons of coding systems, or a function symbol.
7999 If VAL is a coding system, it is used for both decoding what received
8000 from the program and encoding what sent to the program.
8001 If VAL is a cons of coding systems, the car part is used for decoding,
8002 and the cdr part is used for encoding.
8003 If VAL is a function symbol, the function must return a coding system
8004 or a cons of coding systems which are used as above.
8005
8006 See also the function `find-operation-coding-system'. */);
8007 Vprocess_coding_system_alist = Qnil;
8008
8009 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8010 doc: /* Alist to decide a coding system to use for a network I/O operation.
8011 The format is ((PATTERN . VAL) ...),
8012 where PATTERN is a regular expression matching a network service name
8013 or is a port number to connect to,
8014 VAL is a coding system, a cons of coding systems, or a function symbol.
8015 If VAL is a coding system, it is used for both decoding what received
8016 from the network stream and encoding what sent to the network stream.
8017 If VAL is a cons of coding systems, the car part is used for decoding,
8018 and the cdr part is used for encoding.
8019 If VAL is a function symbol, the function must return a coding system
8020 or a cons of coding systems which are used as above.
8021
8022 See also the function `find-operation-coding-system'. */);
8023 Vnetwork_coding_system_alist = Qnil;
8024
8025 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8026 doc: /* Coding system to use with system messages.
8027 Also used for decoding keyboard input on X Window system. */);
8028 Vlocale_coding_system = Qnil;
8029
8030 /* The eol mnemonics are reset in startup.el system-dependently. */
8031 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8032 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8033 eol_mnemonic_unix = build_string (":");
8034
8035 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8036 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8037 eol_mnemonic_dos = build_string ("\\");
8038
8039 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8040 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8041 eol_mnemonic_mac = build_string ("/");
8042
8043 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8044 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
8045 eol_mnemonic_undecided = build_string (":");
8046
8047 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8048 doc: /* *Non-nil enables character translation while encoding and decoding. */);
8049 Venable_character_translation = Qt;
8050
8051 DEFVAR_LISP ("standard-translation-table-for-decode",
8052 &Vstandard_translation_table_for_decode,
8053 doc: /* Table for translating characters while decoding. */);
8054 Vstandard_translation_table_for_decode = Qnil;
8055
8056 DEFVAR_LISP ("standard-translation-table-for-encode",
8057 &Vstandard_translation_table_for_encode,
8058 doc: /* Table for translating characters while encoding. */);
8059 Vstandard_translation_table_for_encode = Qnil;
8060
8061 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8062 doc: /* Alist of charsets vs revision numbers.
8063 While encoding, if a charset (car part of an element) is found,
8064 designate it with the escape sequence identifying revision (cdr part of the element). */);
8065 Vcharset_revision_alist = Qnil;
8066
8067 DEFVAR_LISP ("default-process-coding-system",
8068 &Vdefault_process_coding_system,
8069 doc: /* Cons of coding systems used for process I/O by default.
8070 The car part is used for decoding a process output,
8071 the cdr part is used for encoding a text to be sent to a process. */);
8072 Vdefault_process_coding_system = Qnil;
8073
8074 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8075 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8076 This is a vector of length 256.
8077 If Nth element is non-nil, the existence of code N in a file
8078 \(or output of subprocess) doesn't prevent it to be detected as
8079 a coding system of ISO 2022 variant which has a flag
8080 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8081 or reading output of a subprocess.
8082 Only 128th through 159th elements has a meaning. */);
8083 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8084
8085 DEFVAR_LISP ("select-safe-coding-system-function",
8086 &Vselect_safe_coding_system_function,
8087 doc: /* Function to call to select safe coding system for encoding a text.
8088
8089 If set, this function is called to force a user to select a proper
8090 coding system which can encode the text in the case that a default
8091 coding system used in each operation can't encode the text.
8092
8093 The default value is `select-safe-coding-system' (which see). */);
8094 Vselect_safe_coding_system_function = Qnil;
8095
8096 DEFVAR_BOOL ("coding-system-require-warning",
8097 &coding_system_require_warning,
8098 doc: /* Internal use only.
8099 If non-nil, on writing a file, `select-safe-coding-system-function' is
8100 called even if `coding-system-for-write' is non-nil. The command
8101 `universal-coding-system-argument' binds this variable to t temporarily. */);
8102 coding_system_require_warning = 0;
8103
8104
8105 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8106 &inhibit_iso_escape_detection,
8107 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8108
8109 By default, on reading a file, Emacs tries to detect how the text is
8110 encoded. This code detection is sensitive to escape sequences. If
8111 the sequence is valid as ISO2022, the code is determined as one of
8112 the ISO2022 encodings, and the file is decoded by the corresponding
8113 coding system (e.g. `iso-2022-7bit').
8114
8115 However, there may be a case that you want to read escape sequences in
8116 a file as is. In such a case, you can set this variable to non-nil.
8117 Then, as the code detection ignores any escape sequences, no file is
8118 detected as encoded in some ISO2022 encoding. The result is that all
8119 escape sequences become visible in a buffer.
8120
8121 The default value is nil, and it is strongly recommended not to change
8122 it. That is because many Emacs Lisp source files that contain
8123 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8124 in Emacs's distribution, and they won't be decoded correctly on
8125 reading if you suppress escape sequence detection.
8126
8127 The other way to read escape sequences in a file without decoding is
8128 to explicitly specify some coding system that doesn't use ISO2022's
8129 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8130 inhibit_iso_escape_detection = 0;
8131
8132 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8133 doc: /* Char table for translating self-inserting characters.
8134 This is applied to the result of input methods, not their input. See also
8135 `keyboard-translate-table'. */);
8136 Vtranslation_table_for_input = Qnil;
8137 }
8138
8139 char *
8140 emacs_strerror (error_number)
8141 int error_number;
8142 {
8143 char *str;
8144
8145 synchronize_system_messages_locale ();
8146 str = strerror (error_number);
8147
8148 if (! NILP (Vlocale_coding_system))
8149 {
8150 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8151 Vlocale_coding_system,
8152 0);
8153 str = (char *) SDATA (dec);
8154 }
8155
8156 return str;
8157 }
8158
8159 #endif /* emacs */
8160
8161 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8162 (do not change this comment) */