]> code.delx.au - gnu-emacs/blob - src/coding.c
Merged in changes from CVS trunk.
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995,97,1998,2002,2003 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001,2002,2003 Free Software Foundation, Inc.
5
6 This file is part of GNU Emacs.
7
8 GNU Emacs is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU Emacs is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GNU Emacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
22
23 /*** TABLE OF CONTENTS ***
24
25 0. General comments
26 1. Preamble
27 2. Emacs' internal format (emacs-mule) handlers
28 3. ISO2022 handlers
29 4. Shift-JIS and BIG5 handlers
30 5. CCL handlers
31 6. End-of-line handlers
32 7. C library functions
33 8. Emacs Lisp library functions
34 9. Post-amble
35
36 */
37
38 /*** 0. General comments ***/
39
40
41 /*** GENERAL NOTE on CODING SYSTEMS ***
42
43 A coding system is an encoding mechanism for one or more character
44 sets. Here's a list of coding systems which Emacs can handle. When
45 we say "decode", it means converting some other coding system to
46 Emacs' internal format (emacs-mule), and when we say "encode",
47 it means converting the coding system emacs-mule to some other
48 coding system.
49
50 0. Emacs' internal format (emacs-mule)
51
52 Emacs itself holds a multi-lingual character in buffers and strings
53 in a special format. Details are described in section 2.
54
55 1. ISO2022
56
57 The most famous coding system for multiple character sets. X's
58 Compound Text, various EUCs (Extended Unix Code), and coding
59 systems used in Internet communication such as ISO-2022-JP are
60 all variants of ISO2022. Details are described in section 3.
61
62 2. SJIS (or Shift-JIS or MS-Kanji-Code)
63
64 A coding system to encode character sets: ASCII, JISX0201, and
65 JISX0208. Widely used for PC's in Japan. Details are described in
66 section 4.
67
68 3. BIG5
69
70 A coding system to encode the character sets ASCII and Big5. Widely
71 used for Chinese (mainly in Taiwan and Hong Kong). Details are
72 described in section 4. In this file, when we write "BIG5"
73 (all uppercase), we mean the coding system, and when we write
74 "Big5" (capitalized), we mean the character set.
75
76 4. Raw text
77
78 A coding system for text containing random 8-bit code. Emacs does
79 no code conversion on such text except for end-of-line format.
80
81 5. Other
82
83 If a user wants to read/write text encoded in a coding system not
84 listed above, he can supply a decoder and an encoder for it as CCL
85 (Code Conversion Language) programs. Emacs executes the CCL program
86 while reading/writing.
87
88 Emacs represents a coding system by a Lisp symbol that has a property
89 `coding-system'. But, before actually using the coding system, the
90 information about it is set in a structure of type `struct
91 coding_system' for rapid processing. See section 6 for more details.
92
93 */
94
95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
96
97 How end-of-line of text is encoded depends on the operating system.
98 For instance, Unix's format is just one byte of `line-feed' code,
99 whereas DOS's format is two-byte sequence of `carriage-return' and
100 `line-feed' codes. MacOS's format is usually one byte of
101 `carriage-return'.
102
103 Since text character encoding and end-of-line encoding are
104 independent, any coding system described above can have any
105 end-of-line format. So Emacs has information about end-of-line
106 format in each coding-system. See section 6 for more details.
107
108 */
109
110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111
112 These functions check if a text between SRC and SRC_END is encoded
113 in the coding system category XXX. Each returns an integer value in
114 which appropriate flag bits for the category XXX are set. The flag
115 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
116 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes
117 of the range 0x80..0x9F are in multibyte form. */
118 #if 0
119 int
120 detect_coding_emacs_mule (src, src_end, multibytep)
121 unsigned char *src, *src_end;
122 int multibytep;
123 {
124 ...
125 }
126 #endif
127
128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
129
130 These functions decode SRC_BYTES length of unibyte text at SOURCE
131 encoded in CODING to Emacs' internal format. The resulting
132 multibyte text goes to a place pointed to by DESTINATION, the length
133 of which should not exceed DST_BYTES.
134
135 These functions set the information about original and decoded texts
136 in the members `produced', `produced_char', `consumed', and
137 `consumed_char' of the structure *CODING. They also set the member
138 `result' to one of CODING_FINISH_XXX indicating how the decoding
139 finished.
140
141 DST_BYTES zero means that the source area and destination area are
142 overlapped, which means that we can produce a decoded text until it
143 reaches the head of the not-yet-decoded source text.
144
145 Below is a template for these functions. */
146 #if 0
147 static void
148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 {
153 ...
154 }
155 #endif
156
157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
158
159 These functions encode SRC_BYTES length text at SOURCE from Emacs'
160 internal multibyte format to CODING. The resulting unibyte text
161 goes to a place pointed to by DESTINATION, the length of which
162 should not exceed DST_BYTES.
163
164 These functions set the information about original and encoded texts
165 in the members `produced', `produced_char', `consumed', and
166 `consumed_char' of the structure *CODING. They also set the member
167 `result' to one of CODING_FINISH_XXX indicating how the encoding
168 finished.
169
170 DST_BYTES zero means that the source area and destination area are
171 overlapped, which means that we can produce encoded text until it
172 reaches the head of the not-yet-encoded source text.
173
174 Below is a template for these functions. */
175 #if 0
176 static void
177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
178 struct coding_system *coding;
179 unsigned char *source, *destination;
180 int src_bytes, dst_bytes;
181 {
182 ...
183 }
184 #endif
185
186 /*** COMMONLY USED MACROS ***/
187
/* The two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely fetch one
   and two bytes respectively from the source text.  When the source
   does not hold enough bytes, they store
   CODING_FINISH_INSUFFICIENT_SRC in coding->result and jump to
   `label_end_of_loop' without advancing `src'.  The caller must have
   the variables `coding', `src' and `src_end' set up in advance.
   These macros are called from the decoding routines
   `decode_coding_XXX', so the source text is assumed to be
   unibyte.  */

#define ONE_MORE_BYTE(c1)                                       \
  do {                                                          \
    if (src < src_end)                                          \
      (c1) = *src++;                                            \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
205
/* Fetch the next two source bytes into C1 and C2, in that order.  If
   fewer than two bytes remain before `src_end', store
   CODING_FINISH_INSUFFICIENT_SRC in coding->result and jump to
   `label_end_of_loop' without consuming anything.  */

#define TWO_MORE_BYTES(c1, c2)                                  \
  do {                                                          \
    if (src + 1 < src_end)                                      \
      {                                                         \
        (c1) = *src++;                                          \
        (c2) = *src++;                                          \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
216
217
/* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
   form if MULTIBYTEP is nonzero.  In that case a byte equal to
   LEADING_CODE_8_BIT_CONTROL introduces a two-byte sequence, and C1
   is set to the second byte minus 0x20.

   If the source is exhausted -- including the case where the leading
   code is the very last source byte, so that its second byte is
   missing -- store CODING_FINISH_INSUFFICIENT_SRC in coding->result
   and jump to `label_end_of_loop', leaving `src' and C1 untouched.
   The second byte is never read from beyond `src_end'.  */

#define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
  do {                                                          \
    if (src >= src_end)                                         \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
        goto label_end_of_loop;                                 \
      }                                                         \
    if ((multibytep) && *src == LEADING_CODE_8_BIT_CONTROL)     \
      {                                                         \
        /* Both bytes of the sequence must be available.  */    \
        if (src + 1 >= src_end)                                 \
          {                                                     \
            coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
            goto label_end_of_loop;                             \
          }                                                     \
        src++;                                                  \
        (c1) = *src++ - 0x20;                                   \
      }                                                         \
    else                                                        \
      (c1) = *src++;                                            \
  } while (0)
232
/* Fetch the next character of the source text at `src' into C and
   advance `src' past it.  If no source bytes remain, store
   CODING_FINISH_INSUFFICIENT_SRC in coding->result and jump to
   `label_end_of_loop'.  The caller must have the variables `coding',
   `src', `src_end' and `translation_table' set up in advance.

   This macro serves the encoding routines `encode_coding_XXX', so
   the source text is taken to be in multibyte form except for 8-bit
   characters: those are in multibyte form when coding->src_multibyte
   is nonzero, and single bytes otherwise.  When `translation_table'
   is non-nil, C is passed through translate_char before `src' is
   advanced.  */

#define ONE_MORE_CHAR(c)                                                \
  do {                                                                  \
    int src_left = src_end - src;                                       \
    int char_len;                                                       \
    if (src_left <= 0)                                                  \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_SRC;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    if (! coding->src_multibyte                                         \
        && ! UNIBYTE_STR_AS_MULTIBYTE_P (src, src_left, char_len))      \
      {                                                                 \
        /* A lone byte that is not a valid multibyte sequence.  */      \
        (c) = *src;                                                     \
        char_len = 1;                                                   \
      }                                                                 \
    else                                                                \
      (c) = STRING_CHAR_AND_LENGTH (src, src_left, char_len);           \
    if (! NILP (translation_table))                                     \
      (c) = translate_char (translation_table, (c), -1, 0, 0);          \
    src += char_len;                                                    \
  } while (0)
261
262
/* Emit the decoded character C.  This macro is used in decoding
   routines.

   Unless a composition is in progress whose method is neither
   COMPOSITION_RELATIVE nor COMPOSITION_WITH_RULE, the multibyte form
   of C is written at `dst' and coding->produced_char is incremented.
   If `dst' lacks room for it, CODING_FINISH_INSUFFICIENT_DST is
   stored in coding->result and control jumps to `label_end_of_loop'.

   If a composition is in progress and its method is not
   COMPOSITION_RELATIVE, C is also recorded as a component of the
   current composition in coding->cmp_data->data.  So for a method
   other than RELATIVE and WITH_RULE (e.g. an ALTCHAR of the current
   composition), C goes only to coding->cmp_data->data and not to
   `dst'.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! (COMPOSING_P (coding)                                         \
           && coding->composing != COMPOSITION_RELATIVE                 \
           && coding->composing != COMPOSITION_WITH_RULE))              \
      {                                                                 \
        int need = CHAR_BYTES (c);                                      \
        if (dst + need > (dst_bytes ? dst_end : src))                   \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        /* Evaluated after the component was added, as that may         \
           have reset coding->composing.  */                            \
        coding->composition_rule_follows                                \
          = (coding->composing != COMPOSITION_WITH_ALTCHARS);           \
      }                                                                 \
  } while (0)
297
298
/* Store the single byte C at `dst' and advance `dst'.  The limit of
   the destination is `dst_end' when `dst_bytes' is nonzero, and
   `src' otherwise.  If `dst' has already reached that limit, store
   CODING_FINISH_INSUFFICIENT_DST in coding->result and jump to
   `label_end_of_loop' instead.  */

#define EMIT_ONE_BYTE(c)                                        \
  do {                                                          \
    if (dst < (dst_bytes ? dst_end : src))                      \
      *dst++ = (c);                                             \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
308
/* Store the bytes C1 and C2 at `dst', in that order, and advance
   `dst' by two.  The limit of the destination is `dst_end' when
   `dst_bytes' is nonzero, and `src' otherwise.  If both bytes do not
   fit below that limit, store CODING_FINISH_INSUFFICIENT_DST in
   coding->result and jump to `label_end_of_loop' without writing
   anything.  */

#define EMIT_TWO_BYTES(c1, c2)                                  \
  do {                                                          \
    if (dst + 2 <= (dst_bytes ? dst_end : src))                 \
      {                                                         \
        *dst++ = (c1);                                          \
        *dst++ = (c2);                                          \
      }                                                         \
    else                                                        \
      {                                                         \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
        goto label_end_of_loop;                                 \
      }                                                         \
  } while (0)
318
/* Copy the bytes in the range FROM (inclusive) .. TO (exclusive) to
   `dst', advancing both `dst' and FROM; on completion FROM equals
   TO.  FROM must therefore be a modifiable pointer variable.  The
   limit of the destination is `dst_end' when `dst_bytes' is nonzero,
   and `src' otherwise.  If the whole range does not fit below that
   limit, store CODING_FINISH_INSUFFICIENT_DST in coding->result and
   jump to `label_end_of_loop' without copying anything.  */

#define EMIT_BYTES(from, to)                                            \
  do {                                                                  \
    if (dst + ((to) - (from)) > (dst_bytes ? dst_end : src))            \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (; (from) < (to); (from)++)                                     \
      *dst++ = *(from);                                                 \
  } while (0)
329
330 \f
331 /*** 1. Preamble ***/
332
333 #ifdef emacs
334 #include <config.h>
335 #endif
336
337 #include <stdio.h>
338
339 #ifdef emacs
340
341 #include "lisp.h"
342 #include "buffer.h"
343 #include "charset.h"
344 #include "composite.h"
345 #include "ccl.h"
346 #include "coding.h"
347 #include "window.h"
348 #include "intervals.h"
349 #include "frame.h"
350 #include "termhooks.h"
351
352 #else /* not emacs */
353
354 #include "mulelib.h"
355
356 #endif /* not emacs */
357
358 Lisp_Object Qcoding_system, Qeol_type;
359 Lisp_Object Qbuffer_file_coding_system;
360 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
361 Lisp_Object Qno_conversion, Qundecided;
362 Lisp_Object Qcoding_system_history;
363 Lisp_Object Qsafe_chars;
364 Lisp_Object Qvalid_codes;
365
366 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
367 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
368 Lisp_Object Qstart_process, Qopen_network_stream;
369 Lisp_Object Qtarget_idx;
370
371 /* If a symbol has this property, evaluate the value to define the
372 symbol as a coding system. */
373 Lisp_Object Qcoding_system_define_form;
374
375 Lisp_Object Vselect_safe_coding_system_function;
376
377 int coding_system_require_warning;
378
379 /* Mnemonic string for each format of end-of-line. */
380 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
381 /* Mnemonic string to indicate format of end-of-line is not yet
382 decided. */
383 Lisp_Object eol_mnemonic_undecided;
384
385 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
386 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
387 int system_eol_type;
388
#ifdef emacs

/* Information about which coding system is safe for which chars.
   The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).

   GENERIC-LIST is a list of generic coding systems which can encode
   any characters.

   NON-GENERIC-ALIST is an alist of non generic coding systems vs the
   corresponding char table that contains safe chars.  */
Lisp_Object Vcoding_system_safe_chars;

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* Coding systems emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule;
Lisp_Object Qraw_text;

Lisp_Object Qutf_8;

/* Coding-systems are handed between Emacs Lisp programs and C
   internal routines by the following three variables.  */

/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to inhibit ISO2022 escape sequence detection.  */
int inhibit_iso_escape_detection;

/* Flag to make buffer-file-coding-system inherit from
   process-coding.  */
int inherit_process_coding_system;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Default coding system to be used to write a file.  */
struct coding_system default_buffer_file_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
447
448 Lisp_Object Qcoding_category, Qcoding_category_index;
449
450 /* List of symbols `coding-category-xxx' ordered by priority. */
451 Lisp_Object Vcoding_category_list;
452
453 /* Table of coding categories (Lisp symbols). */
454 Lisp_Object Vcoding_category_table;
455
456 /* Table of names of symbol for each coding-category. */
457 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
458 "coding-category-emacs-mule",
459 "coding-category-sjis",
460 "coding-category-iso-7",
461 "coding-category-iso-7-tight",
462 "coding-category-iso-8-1",
463 "coding-category-iso-8-2",
464 "coding-category-iso-7-else",
465 "coding-category-iso-8-else",
466 "coding-category-ccl",
467 "coding-category-big5",
468 "coding-category-utf-8",
469 "coding-category-utf-16-be",
470 "coding-category-utf-16-le",
471 "coding-category-raw-text",
472 "coding-category-binary"
473 };
474
475 /* Table of pointers to coding systems corresponding to each coding
476 categories. */
477 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
478
479 /* Table of coding category masks. Nth element is a mask for a coding
480 category of which priority is Nth. */
481 static
482 int coding_priorities[CODING_CATEGORY_IDX_MAX];
483
484 /* Flag to tell if we look up translation table on character code
485 conversion. */
486 Lisp_Object Venable_character_translation;
487 /* Standard translation table to look up on decoding (reading). */
488 Lisp_Object Vstandard_translation_table_for_decode;
489 /* Standard translation table to look up on encoding (writing). */
490 Lisp_Object Vstandard_translation_table_for_encode;
491
492 Lisp_Object Qtranslation_table;
493 Lisp_Object Qtranslation_table_id;
494 Lisp_Object Qtranslation_table_for_decode;
495 Lisp_Object Qtranslation_table_for_encode;
496
497 /* Alist of charsets vs revision number. */
498 Lisp_Object Vcharset_revision_alist;
499
500 /* Default coding systems used for process I/O. */
501 Lisp_Object Vdefault_process_coding_system;
502
503 /* Char table for translating Quail and self-inserting input. */
504 Lisp_Object Vtranslation_table_for_input;
505
506 /* Global flag to tell that we can't call post-read-conversion and
507 pre-write-conversion functions. Usually the value is zero, but it
508 is set to 1 temporarily while such functions are running. This is
509 to avoid infinite recursive call. */
510 static int inhibit_pre_post_conversion;
511
512 Lisp_Object Qchar_coding_system;
513
514 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check
515 its validity. */
516
517 Lisp_Object
518 coding_safe_chars (coding_system)
519 Lisp_Object coding_system;
520 {
521 Lisp_Object coding_spec, plist, safe_chars;
522
523 coding_spec = Fget (coding_system, Qcoding_system);
524 plist = XVECTOR (coding_spec)->contents[3];
525 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
526 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
527 }
528
/* Nonzero if character C is safe according to SAFE_CHARS, i.e. if
   SAFE_CHARS is Qt or the entry for C in SAFE_CHARS is non-nil.
   The Qt test comes first so that CHAR_TABLE_REF is never applied
   to Qt.  */
#define CODING_SAFE_CHAR_P(safe_chars, c)               \
  (EQ ((safe_chars), Qt)                                \
   || ! NILP (CHAR_TABLE_REF ((safe_chars), (c))))
531
532 \f
533 /*** 2. Emacs internal format (emacs-mule) handlers ***/
534
535 /* Emacs' internal format for representation of multiple character
536 sets is a kind of multi-byte encoding, i.e. characters are
537 represented by variable-length sequences of one-byte codes.
538
539 ASCII characters and control characters (e.g. `tab', `newline') are
540 represented by one-byte sequences which are their ASCII codes, in
541 the range 0x00 through 0x7F.
542
543 8-bit characters of the range 0x80..0x9F are represented by
544 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
545 code + 0x20).
546
547 8-bit characters of the range 0xA0..0xFF are represented by
548 one-byte sequences which are their 8-bit code.
549
550 The other characters are represented by a sequence of `base
551 leading-code', optional `extended leading-code', and one or two
552 `position-code's. The length of the sequence is determined by the
553 base leading-code. Leading-code takes the range 0x81 through 0x9D,
554 whereas extended leading-code and position-code take the range 0xA0
555 through 0xFF. See `charset.h' for more details about leading-code
556 and position-code.
557
558 --- CODE RANGE of Emacs' internal format ---
559 character set range
560 ------------- -----
561 ascii 0x00..0x7F
562 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
563 eight-bit-graphic 0xA0..0xFF
564 ELSE 0x81..0x9D + [0xA0..0xFF]+
565 ---------------------------------------------
566
567 As this is the internal character representation, the format is
568 usually not used externally (i.e. in a file or in a data sent to a
569 process). But, it is possible to have a text externally in this
570 format (i.e. by encoding by the coding system `emacs-mule').
571
572 In that case, a sequence of one-byte codes has a slightly different
573 form.
574
575 Firstly, all characters in eight-bit-control are represented by
576 one-byte sequences which are their 8-bit code.
577
578 Next, character composition data are represented by the byte
579 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
580 where,
581 METHOD is 0xF0 plus one of composition method (enum
582 composition_method),
583
584 BYTES is 0xA0 plus the byte length of these composition data,
585
586 CHARS is 0xA0 plus the number of characters composed by these
587 data,
588
589 COMPONENTs are characters of multibyte form or composition
590 rules encoded by two-byte of ASCII codes.
591
592 In addition, for backward compatibility, the following formats are
593 also recognized as composition data on decoding.
594
595 0x80 MSEQ ...
596 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
597
598 Here,
599 MSEQ is a multibyte form, but in one of these special formats:
600 ASCII: 0xA0 ASCII_CODE+0x80,
601 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
602 RULE is a one byte code of the range 0xA0..0xF0 that
603 represents a composition rule.
604 */
605
/* Table holding one `enum emacs_code_class_type' value for each of
   the 256 possible byte values.  NOTE(review): presumably the class
   of each byte in emacs-mule text; the table is not filled in within
   this part of the file -- confirm where it is initialized.  */
606 enum emacs_code_class_type emacs_code_class[256];
607
608 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
609 Check if a text is encoded in Emacs' internal format. If it is,
610 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
611
612 static int
613 detect_coding_emacs_mule (src, src_end, multibytep)
614 unsigned char *src, *src_end;
615 int multibytep;
616 {
617 unsigned char c;
618 int composing = 0;
619 /* Dummy for ONE_MORE_BYTE. */
620 struct coding_system dummy_coding;
621 struct coding_system *coding = &dummy_coding;
622
623 while (1)
624 {
625 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
626
627 if (composing)
628 {
629 if (c < 0xA0)
630 composing = 0;
631 else if (c == 0xA0)
632 {
633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
634 c &= 0x7F;
635 }
636 else
637 c -= 0x20;
638 }
639
640 if (c < 0x20)
641 {
642 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
643 return 0;
644 }
645 else if (c >= 0x80 && c < 0xA0)
646 {
647 if (c == 0x80)
648 /* Old leading code for a composite character. */
649 composing = 1;
650 else
651 {
652 unsigned char *src_base = src - 1;
653 int bytes;
654
655 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
656 bytes))
657 return 0;
658 src = src_base + bytes;
659 }
660 }
661 }
662 label_end_of_loop:
663 return CODING_CATEGORY_MASK_EMACS_MULE;
664 }
665
666
/* Open a new composition record in CODING->cmp_data: remember where
   the record begins in CODING->cmp_data_start, then fill in its
   four-slot header.  Slot 0 is set to -1, slot 1 to the starting
   position START offset by cmp_data->char_offset, and slot 3 to
   METHOD.  Slot 2 is left untouched here.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)             \
  do {                                                                  \
    struct composition_data *cmpd = (coding)->cmp_data;                 \
    int *rec = cmpd->data + cmpd->used;                                 \
    (coding)->cmp_data_start = cmpd->used;                              \
    cmpd->used += 4;                                                    \
    rec[0] = -1;                                                        \
    rec[1] = cmpd->char_offset + (start);                               \
    rec[3] = (int) (method);                                            \
  } while (0)
679
/* Close the composition record that begins at CODING->cmp_data_start:
   store the record's length so far (cmp_data->used minus the start
   index) in slot 0, and the ending position END offset by
   cmp_data->char_offset in slot 2.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                         \
  do {                                                                  \
    struct composition_data *cmpd = (coding)->cmp_data;                 \
    int *rec = cmpd->data + (coding)->cmp_data_start;                   \
    rec[2] = cmpd->char_offset + (end);                                 \
    rec[0] = cmpd->used - (coding)->cmp_data_start;                     \
  } while (0)
689
/* Append one COMPONENT (alternate character or composition rule) to
   the composition data of CODING.  If the current record thereby
   reaches COMPOSITION_DATA_MAX_BUNCH_LENGTH entries, close it at
   CODING->produced_char and set CODING->composing to
   COMPOSITION_NO.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  do {                                                                  \
    struct composition_data *bunch = (coding)->cmp_data;                \
    bunch->data[bunch->used++] = (component);                           \
    if (bunch->used - (coding)->cmp_data_start                          \
        == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, (coding)->produced_char);   \
        (coding)->composing = COMPOSITION_NO;                           \
      }                                                                 \
  } while (0)
702
703
/* Yield the byte at `src' and advance `src' by one.  If `src' is not
   less than `src_end', yield -1 instead and leave `src' alone.  */

#define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
708
709
/* Decode a character represented as a component of a composition
   sequence of Emacs 20 style at `src'.  Set C to that character,
   store its multibyte form sequence at P, and set P to the end of
   that sequence.  If no valid character is found, set C to -1.
   C is also negative when the source is already exhausted.

   The component is in one of the MSEQ forms described in the
   comment of section 2:
     ASCII: 0xA0 ASCII_CODE+0x80,
     other: LEADING_CODE+0x20 FOLLOWING-BYTE ...

   The caller must have `src', `src_end' and `coding' in scope.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                        \
  do {                                                                  \
    int bytes;                                                          \
                                                                        \
    c = SAFE_ONE_MORE_BYTE ();                                          \
    if (c < 0)                                                          \
      break;                                                            \
    if (CHAR_HEAD_P (c))                                                \
      c = -1;                                                           \
    else if (c == 0xA0)                                                 \
      {                                                                 \
        /* ASCII: the following byte is ASCII_CODE + 0x80, so           \
           subtract 0x80 (not 0xA0) to recover the ASCII code.  */      \
        c = SAFE_ONE_MORE_BYTE ();                                      \
        if (c < 0xA0)                                                   \
          c = -1;                                                       \
        else                                                            \
          {                                                             \
            c -= 0x80;                                                  \
            *p++ = c;                                                   \
          }                                                             \
      }                                                                 \
    else if (BASE_LEADING_CODE_P (c - 0x20))                            \
      {                                                                 \
        unsigned char *p0 = p;                                          \
                                                                        \
        /* Undo the +0x20 applied to the leading code, then copy        \
           the following bytes as they are.  */                         \
        c -= 0x20;                                                      \
        *p++ = c;                                                       \
        bytes = BYTES_BY_CHAR_HEAD (c);                                 \
        while (--bytes)                                                 \
          {                                                             \
            c = SAFE_ONE_MORE_BYTE ();                                  \
            if (c < 0)                                                  \
              break;                                                    \
            *p++ = c;                                                   \
          }                                                             \
        if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)              \
            || (coding->flags /* We are recovering a file.  */          \
                && p0[0] == LEADING_CODE_8_BIT_CONTROL                  \
                && ! CHAR_HEAD_P (p0[1])))                              \
          c = STRING_CHAR (p0, bytes);                                  \
        else                                                            \
          c = -1;                                                       \
      }                                                                 \
    else                                                                \
      c = -1;                                                           \
  } while (0)
760
761
/* Decode a composition rule represented as a component of a
   composition sequence of Emacs 20 style at `src'.  The rule is one
   byte; after subtracting 0xA0 it must lie in 0..80 and is split
   into GREF = value / 9 and NREF = value % 9.  Set C to the rule
   encoded by COMPOSITION_ENCODE_RULE.  If no valid rule is found
   (including the case that the source is exhausted), set C to -1.
   The caller must have the variables `gref' and `nref' in scope.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
  do {                                                  \
    c = SAFE_ONE_MORE_BYTE () - 0xA0;                   \
    if (c >= 0 && c < 81)                               \
      {                                                 \
        gref = c / 9;                                   \
        nref = c % 9;                                   \
        c = COMPOSITION_ENCODE_RULE (gref, nref);       \
      }                                                 \
    else                                                \
      c = -1;                                           \
  } while (0)
778
779
780 /* Decode composition sequence encoded by `emacs-mule' at the source
781 pointed by SRC. SRC_END is the end of source. Store information
782 of the composition in CODING->cmp_data.
783
784 For backward compatibility, decode also a composition sequence of
785 Emacs 20 style. In that case, the composition sequence contains
786 characters that should be extracted into a buffer or string. Store
787 those characters at *DESTINATION in multibyte form.
788
789 If we encounter an invalid byte sequence, return 0.
790 If we encounter an insufficient source or destination, or
791 insufficient space in CODING->cmp_data, return 1.
792 Otherwise, return consumed bytes in the source.
793
794 */
795 static INLINE int
796 decode_composition_emacs_mule (coding, src, src_end,
797 destination, dst_end, dst_bytes)
798 struct coding_system *coding;
799 unsigned char *src, *src_end, **destination, *dst_end;
800 int dst_bytes;
801 {
802 unsigned char *dst = *destination;
803 int method, data_len, nchars;
804 unsigned char *src_base = src++;
805 /* Store components of composition. */
806 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
807 int ncomponent;
808 /* Store multibyte form of characters to be composed. This is for
809 Emacs 20 style composition sequence. */
810 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
811 unsigned char *bufp = buf;
812 int c, i, gref, nref;
813
814 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
815 >= COMPOSITION_DATA_SIZE)
816 {
817 coding->result = CODING_FINISH_INSUFFICIENT_CMP;
818 return -1;
819 }
820
821 ONE_MORE_BYTE (c);
822 if (c - 0xF0 >= COMPOSITION_RELATIVE
823 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
824 {
825 int with_rule;
826
827 method = c - 0xF0;
828 with_rule = (method == COMPOSITION_WITH_RULE
829 || method == COMPOSITION_WITH_RULE_ALTCHARS);
830 ONE_MORE_BYTE (c);
831 data_len = c - 0xA0;
832 if (data_len < 4
833 || src_base + data_len > src_end)
834 return 0;
835 ONE_MORE_BYTE (c);
836 nchars = c - 0xA0;
837 if (c < 1)
838 return 0;
839 for (ncomponent = 0; src < src_base + data_len; ncomponent++)
840 {
841 /* If it is longer than this, it can't be valid. */
842 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
843 return 0;
844
845 if (ncomponent % 2 && with_rule)
846 {
847 ONE_MORE_BYTE (gref);
848 gref -= 32;
849 ONE_MORE_BYTE (nref);
850 nref -= 32;
851 c = COMPOSITION_ENCODE_RULE (gref, nref);
852 }
853 else
854 {
855 int bytes;
856 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
857 || (coding->flags /* We are recovering a file. */
858 && src[0] == LEADING_CODE_8_BIT_CONTROL
859 && ! CHAR_HEAD_P (src[1])))
860 c = STRING_CHAR (src, bytes);
861 else
862 c = *src, bytes = 1;
863 src += bytes;
864 }
865 component[ncomponent] = c;
866 }
867 }
868 else
869 {
870 /* This may be an old Emacs 20 style format. See the comment at
871 the section 2 of this file. */
872 while (src < src_end && !CHAR_HEAD_P (*src)) src++;
873 if (src == src_end
874 && !(coding->mode & CODING_MODE_LAST_BLOCK))
875 goto label_end_of_loop;
876
877 src_end = src;
878 src = src_base + 1;
879 if (c < 0xC0)
880 {
881 method = COMPOSITION_RELATIVE;
882 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
883 {
884 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
885 if (c < 0)
886 break;
887 component[ncomponent++] = c;
888 }
889 if (ncomponent < 2)
890 return 0;
891 nchars = ncomponent;
892 }
893 else if (c == 0xFF)
894 {
895 method = COMPOSITION_WITH_RULE;
896 src++;
897 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
898 if (c < 0)
899 return 0;
900 component[0] = c;
901 for (ncomponent = 1;
902 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
903 {
904 DECODE_EMACS_MULE_COMPOSITION_RULE (c);
905 if (c < 0)
906 break;
907 component[ncomponent++] = c;
908 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
909 if (c < 0)
910 break;
911 component[ncomponent++] = c;
912 }
913 if (ncomponent < 3)
914 return 0;
915 nchars = (ncomponent + 1) / 2;
916 }
917 else
918 return 0;
919 }
920
921 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
922 {
923 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
924 for (i = 0; i < ncomponent; i++)
925 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
926 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
927 if (buf < bufp)
928 {
929 unsigned char *p = buf;
930 EMIT_BYTES (p, bufp);
931 *destination += bufp - buf;
932 coding->produced_char += nchars;
933 }
934 return (src - src_base);
935 }
936 label_end_of_loop:
937 return -1;
938 }
939
940 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode the emacs-mule text of SRC_BYTES bytes at SOURCE into
   DESTINATION.  End-of-line sequences are converted according to
   CODING->eol_type, a 0x80 byte is handed to
   decode_composition_emacs_mule when CODING->cmp_data is non-null,
   and any other character is copied byte for byte unless its
   multibyte form is broken, in which case only its first byte is
   re-encoded with CHAR_STRING.

   On return CODING->consumed and CODING->consumed_char (both set to
   the same byte count), CODING->produced and CODING->produced_char
   are updated.  CODING->result is set here only when decoding stops
   because of an inconsistent end-of-line or a full destination.  */
941
942 static void
943 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
944 struct coding_system *coding;
945 unsigned char *source, *destination;
946 int src_bytes, dst_bytes;
947 {
948 unsigned char *src = source;
949 unsigned char *src_end = source + src_bytes;
950 unsigned char *dst = destination;
951 unsigned char *dst_end = destination + dst_bytes;
952 /* SRC_BASE remembers the start position in source in each loop.
953 The loop will be exited when there's not enough source code, or
954 when there's not enough destination area to produce a
955 character. */
956 unsigned char *src_base;
957
958 coding->produced_char = 0;
959 while ((src_base = src) < src_end)
960 {
/* TMP holds a character re-encoded by CHAR_STRING; P points at the
   bytes to copy (either TMP or a position in the source) and BYTES
   is how many of them to copy.  */
961 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
962 int bytes;
963
/* Carriage return: translate it according to the end-of-line type.  */
964 if (*src == '\r')
965 {
966 int c = *src++;
967
968 if (coding->eol_type == CODING_EOL_CR)
969 c = '\n';
970 else if (coding->eol_type == CODING_EOL_CRLF)
971 {
/* Look at the byte after CR.  CR LF becomes a single LF; a CR
   followed by anything else is kept as CR and the extra byte is
   given back to the source.  */
972 ONE_MORE_BYTE (c);
973 if (c != '\n')
974 {
975 src--;
976 c = '\r';
977 }
978 }
979 *dst++ = c;
980 coding->produced_char++;
981 continue;
982 }
983 else if (*src == '\n')
984 {
/* A bare LF in text whose end-of-line type is CR or CRLF is
   inconsistent; stop decoding if the caller asked for that.  */
985 if ((coding->eol_type == CODING_EOL_CR
986 || coding->eol_type == CODING_EOL_CRLF)
987 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
988 {
989 coding->result = CODING_FINISH_INCONSISTENT_EOL;
990 goto label_end_of_loop;
991 }
992 *dst++ = *src++;
993 coding->produced_char++;
994 continue;
995 }
996 else if (*src == 0x80 && coding->cmp_data)
997 {
998 /* Start of composition data. */
999 int consumed = decode_composition_emacs_mule (coding, src, src_end,
1000 &dst, dst_end,
1001 dst_bytes);
/* A negative value ends the loop, a positive value is the number
   of source bytes to skip.  */
1002 if (consumed < 0)
1003 goto label_end_of_loop;
1004 else if (consumed > 0)
1005 {
1006 src += consumed;
1007 continue;
1008 }
/* CONSUMED is zero here: emit the 0x80 byte itself as one
   character.  */
1009 bytes = CHAR_STRING (*src, tmp);
1010 p = tmp;
1011 src++;
1012 }
1013 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1014 || (coding->flags /* We are recovering a file. */
1015 && src[0] == LEADING_CODE_8_BIT_CONTROL
1016 && ! CHAR_HEAD_P (src[1])))
1017 {
/* Copy BYTES bytes straight from the source.  BYTES is expected to
   have been set by UNIBYTE_STR_AS_MULTIBYTE_P.
   NOTE(review): confirm that BYTES is also set when only the
   second (file recovery) condition holds.  */
1018 p = src;
1019 src += bytes;
1020 }
1021 else
1022 {
1023 int i, c;
1024
/* Check that the leading byte is followed by as many trailing
   bytes as BYTES_BY_CHAR_HEAD announces.  */
1025 bytes = BYTES_BY_CHAR_HEAD (*src);
1026 src++;
1027 for (i = 1; i < bytes; i++)
1028 {
1029 ONE_MORE_BYTE (c);
1030 if (CHAR_HEAD_P (c))
1031 break;
1032 }
1033 if (i < bytes)
1034 {
/* A new character head showed up too early: re-encode only the
   first byte and resume decoding right after it.  */
1035 bytes = CHAR_STRING (*src_base, tmp);
1036 p = tmp;
1037 src = src_base + 1;
1038 }
1039 else
1040 {
1041 p = src_base;
1042 }
1043 }
/* When DST_BYTES is zero the limit is the current source position
   instead of DST_END (presumably source and destination then share
   one buffer -- confirm against the callers).  */
1044 if (dst + bytes >= (dst_bytes ? dst_end : src))
1045 {
1046 coding->result = CODING_FINISH_INSUFFICIENT_DST;
1047 break;
1048 }
1049 while (bytes--) *dst++ = *p++;
1050 coding->produced_char++;
1051 }
1052 label_end_of_loop:
/* SRC_BASE is the start of the first character that was not fully
   processed, so only the bytes before it count as consumed.  */
1053 coding->consumed = coding->consumed_char = src_base - source;
1054 coding->produced = dst - destination;
1055 }
1056
1057
/* Encode the composition data stored at DATA into a special byte
   sequence starting with 0x80, and append that sequence at `dst'.

   DATA is read as follows: DATA[0] is the number of elements of this
   record, DATA[2] - DATA[1] is the number of composed characters,
   DATA[3] is the composition method, and DATA[4] onward are the
   components (for the two WITH_RULE methods, characters interleaved
   with encoded rules).

   The emitted sequence is
        0x80  0xF0+METHOD  0xA0+BYTES  0xA0+CHARS  COMPONENT ...
   where BYTES counts the whole sequence, these four header bytes
   included.

   Afterwards CODING->cmp_data_start is advanced past the record, and
   CODING->cmp_data moves to the next block when the current block is
   exhausted and a next block exists.

   This macro uses the variables `dst', `dst_end', `dst_bytes' and
   `src', and the label `label_end_of_loop', of the function it is
   expanded in: when the sequence does not fit in the destination it
   sets CODING->result to CODING_FINISH_INSUFFICIENT_DST and jumps to
   that label without writing anything.  Both macro parameters are
   evaluated more than once.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \
  do { \
    unsigned char buf[1024], *p0 = buf, *p; \
    int len = (data)[0]; \
    int i; \
 \
    buf[0] = 0x80; \
    buf[1] = 0xF0 + (data)[3]; /* METHOD */ \
    buf[3] = 0xA0 + ((data)[2] - (data)[1]); /* COMPOSED-CHARS */ \
    p = buf + 4; \
    if ((data)[3] == COMPOSITION_WITH_RULE \
        || (data)[3] == COMPOSITION_WITH_RULE_ALTCHARS) \
      { \
        /* First component, then (rule, component) pairs.  */ \
        p += CHAR_STRING ((data)[4], p); \
        for (i = 5; i < len; i += 2) \
          { \
            int gref, nref; \
            COMPOSITION_DECODE_RULE ((data)[i], gref, nref); \
            *p++ = 0x20 + gref; \
            *p++ = 0x20 + nref; \
            p += CHAR_STRING ((data)[i + 1], p); \
          } \
      } \
    else \
      { \
        for (i = 4; i < len; i++) \
          p += CHAR_STRING ((data)[i], p); \
      } \
    buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \
 \
    if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \
      { \
        (coding)->result = CODING_FINISH_INSUFFICIENT_DST; \
        goto label_end_of_loop; \
      } \
    while (p0 < p) \
      *dst++ = *p0++; \
    (coding)->cmp_data_start += (data)[0]; \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used \
        && (coding)->cmp_data->next) \
      { \
        (coding)->cmp_data = (coding)->cmp_data->next; \
        (coding)->cmp_data_start = 0; \
      } \
  } while (0)
1107
1108
/* Forward declaration: encode_eol is used below for text that carries
   no composition data.  */
1109 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1110 unsigned char *, int, int));
1111
/* Encode SRC_BYTES bytes at SOURCE into emacs-mule format at
   DESTINATION.

   When CODING has no composition data the whole job is delegated to
   encode_eol.  Otherwise the text is processed one character at a
   time: a composition header is emitted in front of each character
   that starts a composition, end-of-line is converted according to
   CODING->eol_type, and all other characters are copied.

   On return CODING->consumed, CODING->consumed_char, CODING->produced
   and CODING->produced_char are updated (the last two to the same
   byte count).  */
1112 static void
1113 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1114 struct coding_system *coding;
1115 unsigned char *source, *destination;
1116 int src_bytes, dst_bytes;
1117 {
1118 unsigned char *src = source;
1119 unsigned char *src_end = source + src_bytes;
1120 unsigned char *dst = destination;
1121 unsigned char *dst_end = destination + dst_bytes;
1122 unsigned char *src_base;
1123 int c;
1124 int char_offset;
1125 int *data;
1126
/* Not referenced directly in this function; presumably read by the
   ONE_MORE_CHAR macro -- confirm against its definition.  */
1127 Lisp_Object translation_table;
1128
1129 translation_table = Qnil;
1130
1131 /* Optimization for the case that there's no composition. */
1132 if (!coding->cmp_data || coding->cmp_data->used == 0)
1133 {
1134 encode_eol (coding, source, destination, src_bytes, dst_bytes);
1135 return;
1136 }
1137
/* DATA points at the next composition record not yet encoded.  */
1138 char_offset = coding->cmp_data->char_offset;
1139 data = coding->cmp_data->data + coding->cmp_data_start;
/* The loop has no exit condition of its own.  It is left through
   `goto label_end_of_loop', visibly from
   ENCODE_COMPOSITION_EMACS_MULE and presumably also from
   ONE_MORE_CHAR and the EMIT_* macros.  */
1140 while (1)
1141 {
1142 src_base = src;
1143
1144 /* If SRC starts a composition, encode the information about the
1145 composition in advance. */
1146 if (coding->cmp_data_start < coding->cmp_data->used
1147 && char_offset + coding->consumed_char == data[1])
1148 {
1149 ENCODE_COMPOSITION_EMACS_MULE (coding, data);
/* The macro may have advanced to the next record or to the next
   block of composition data, so reload both.  */
1150 char_offset = coding->cmp_data->char_offset;
1151 data = coding->cmp_data->data + coding->cmp_data_start;
1152 }
1153
1154 ONE_MORE_CHAR (c);
/* Newline: emit CR LF or a lone CR when the end-of-line type asks
   for it; otherwise it is handled like any single-byte char.  */
1155 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1156 || coding->eol_type == CODING_EOL_CR))
1157 {
1158 if (coding->eol_type == CODING_EOL_CRLF)
1159 EMIT_TWO_BYTES ('\r', c);
1160 else
1161 EMIT_ONE_BYTE ('\r');
1162 }
1163 else if (SINGLE_BYTE_CHAR_P (c))
1164 {
1165 if (coding->flags && ! ASCII_BYTE_P (c))
1166 {
1167 /* As we are auto saving, retain the multibyte form for
1168 8-bit chars. */
1169 unsigned char buf[MAX_MULTIBYTE_LENGTH];
1170 int bytes = CHAR_STRING (c, buf);
1171
1172 if (bytes == 1)
1173 EMIT_ONE_BYTE (buf[0]);
1174 else
1175 EMIT_TWO_BYTES (buf[0], buf[1]);
1176 }
1177 else
1178 EMIT_ONE_BYTE (c);
1179 }
1180 else
/* Multibyte character: copy its source bytes unchanged.  */
1181 EMIT_BYTES (src_base, src);
1182 coding->consumed_char++;
1183 }
1184 label_end_of_loop:
/* Only the bytes before SRC_BASE, the start of the character being
   processed when the loop was left, count as consumed.  */
1185 coding->consumed = src_base - source;
1186 coding->produced = coding->produced_char = dst - destination;
1187 return;
1188 }
1189
1190 \f
1191 /*** 3. ISO2022 handlers ***/
1192
1193 /* The following note describes the coding system ISO2022 briefly.
1194 Since the intention of this note is to help understand the
1195 functions in this file, some parts are NOT ACCURATE or are OVERLY
1196 SIMPLIFIED. For thorough understanding, please refer to the
1197 original document of ISO2022. This is equivalent to the standard
1198 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1199
1200 ISO2022 provides many mechanisms to encode several character sets
1201 in 7-bit and 8-bit environments. For 7-bit environments, all text
1202 is encoded using bytes less than 128. This may make the encoded
1203 text a little bit longer, but the text passes more easily through
1204 several types of gateway, some of which strip off the MSB (Most
1205 Significant Bit).
1206
1207 There are two kinds of character sets: control character sets and
1208 graphic character sets. The former contain control characters such
1209 as `newline' and `escape' to provide control functions (control
1210 functions are also provided by escape sequences). The latter
1211 contain graphic characters such as 'A' and '-'. Emacs recognizes
1212 two control character sets and many graphic character sets.
1213
1214 Graphic character sets are classified into one of the following
1215 four classes, according to the number of bytes (DIMENSION) and
1216 number of characters in one dimension (CHARS) of the set:
1217 - DIMENSION1_CHARS94
1218 - DIMENSION1_CHARS96
1219 - DIMENSION2_CHARS94
1220 - DIMENSION2_CHARS96
1221
1222 In addition, each character set is assigned an identification tag,
1223 unique for each set, called the "final character" (denoted as <F>
1224 hereafter). The <F> of each character set is decided by ECMA(*)
1225 when it is registered in ISO. The code range of <F> is 0x30..0x7F
1226 (0x30..0x3F are for private use only).
1227
1228 Note (*): ECMA = European Computer Manufacturers Association
1229
1230 Here are examples of graphic character sets [NAME(<F>)]:
1231 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1232 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1233 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1234 o DIMENSION2_CHARS96 -- none for the moment
1235
1236 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1237 C0 [0x00..0x1F] -- control character plane 0
1238 GL [0x20..0x7F] -- graphic character plane 0
1239 C1 [0x80..0x9F] -- control character plane 1
1240 GR [0xA0..0xFF] -- graphic character plane 1
1241
1242 A control character set is directly designated and invoked to C0 or
1243 C1 by an escape sequence. The most common case is that:
1244 - ISO646's control character set is designated/invoked to C0, and
1245 - ISO6429's control character set is designated/invoked to C1,
1246 and usually these designations/invocations are omitted in encoded
1247 text. In a 7-bit environment, only C0 can be used, and a control
1248 character for C1 is encoded by an appropriate escape sequence to
1249 fit into the environment. All control characters for C1 are
1250 defined to have corresponding escape sequences.
1251
1252 A graphic character set is at first designated to one of four
1253 graphic registers (G0 through G3), then these graphic registers are
1254 invoked to GL or GR. These designations and invocations can be
1255 done independently. The most common case is that G0 is invoked to
1256 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1257 these invocations and designations are omitted in encoded text.
1258 In a 7-bit environment, only GL can be used.
1259
1260 When a graphic character set of CHARS94 is invoked to GL, codes
1261 0x20 and 0x7F of the GL area work as control characters SPACE and
1262 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1263 be used.
1264
1265 There are two ways of invocation: locking-shift and single-shift.
1266 With locking-shift, the invocation lasts until the next different
1267 invocation, whereas with single-shift, the invocation affects the
1268 following character only and doesn't affect the locking-shift
1269 state. Invocations are done by the following control characters or
1270 escape sequences:
1271
1272 ----------------------------------------------------------------------
1273 abbrev function cntrl escape seq description
1274 ----------------------------------------------------------------------
1275 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1276 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1277 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1278 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1279 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1280 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1281 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1282 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1283 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1284 ----------------------------------------------------------------------
1285 (*) These are not used by any known coding system.
1286
1287 Control characters for these functions are defined by macros
1288 ISO_CODE_XXX in `coding.h'.
1289
1290 Designations are done by the following escape sequences:
1291 ----------------------------------------------------------------------
1292 escape sequence description
1293 ----------------------------------------------------------------------
1294 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1295 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1296 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1297 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1298 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1299 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1300 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1301 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1302 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1303 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1304 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1305 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1306 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1307 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1308 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1309 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1310 ----------------------------------------------------------------------
1311
1312 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1313 of dimension 1, chars 94, and final character <F>, etc...
1314
1315 Note (*): Although these designations are not allowed in ISO2022,
1316 Emacs accepts them on decoding, and produces them on encoding
1317 CHARS96 character sets in a coding system which is characterized as
1318 7-bit environment, non-locking-shift, and non-single-shift.
1319
1320 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1321 '(' can be omitted. We refer to this as "short-form" hereafter.
1322
1323 Now you may notice that there are a lot of ways of encoding the
1324 same multilingual text in ISO2022. Actually, there exist many
1325 coding systems such as Compound Text (used in X11's inter client
1326 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1327 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1328 localized platforms), and all of these are variants of ISO2022.
1329
1330 In addition to the above, Emacs handles two more kinds of escape
1331 sequences: ISO6429's direction specification and Emacs' private
1332 sequence for specifying character composition.
1333
1334 ISO6429's direction specification takes the following form:
1335 o CSI ']' -- end of the current direction
1336 o CSI '0' ']' -- end of the current direction
1337 o CSI '1' ']' -- start of left-to-right text
1338 o CSI '2' ']' -- start of right-to-left text
1339 The control character CSI (0x9B: control sequence introducer) is
1340 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1341
1342 Character composition specification takes the following form:
1343 o ESC '0' -- start relative composition
1344 o ESC '1' -- end composition
1345 o ESC '2' -- start rule-base composition (*)
1346 o ESC '3' -- start relative composition with alternate chars (**)
1347 o ESC '4' -- start rule-base composition with alternate chars (**)
1348 Since these are not standard escape sequences of any ISO standard,
1349 the use of them with these meanings is restricted to Emacs only.
1350
1351 (*) This form is used only in Emacs 20.5 and older versions,
1352 but the newer versions can safely decode it.
1353 (**) This form is used only in Emacs 21.1 and newer versions,
1354 and the older versions can't decode it.
1355
1356 Here's a list of example usages of these composition escape
1357 sequences (categorized by `enum composition_method').
1358
1359 COMPOSITION_RELATIVE:
1360 ESC 0 CHAR [ CHAR ] ESC 1
1361 COMPOSITION_WITH_RULE:
1362 ESC 2 CHAR [ RULE CHAR ] ESC 1
1363 COMPOSITION_WITH_ALTCHARS:
1364 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1365 COMPOSITION_WITH_RULE_ALTCHARS:
1366 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1367
1368 enum iso_code_class_type iso_code_class[256];
1369
1370 #define CHARSET_OK(idx, charset, c) \
1371 (coding_system_table[idx] \
1372 && (charset == CHARSET_ASCII \
1373 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1374 CODING_SAFE_CHAR_P (safe_chars, c))) \
1375 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1376 charset) \
1377 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1378
1379 #define SHIFT_OUT_OK(idx) \
1380 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1381
1382 #define COMPOSITION_OK(idx) \
1383 (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1384
1385 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1386 Check whether the text between SRC and SRC_END is encoded in
1387 ISO2022. If it is, return an integer in which the appropriate
   flag bits among:
1388 CODING_CATEGORY_MASK_ISO_7
1389 CODING_CATEGORY_MASK_ISO_7_TIGHT
1390 CODING_CATEGORY_MASK_ISO_8_1
1391 CODING_CATEGORY_MASK_ISO_8_2
1392 CODING_CATEGORY_MASK_ISO_7_ELSE
1393 CODING_CATEGORY_MASK_ISO_8_ELSE
1394 are set. If a code which should never appear in ISO2022 is found,
1395 return 0.

   Two bit sets are maintained while scanning: MASK holds the
   categories not yet ruled out, MASK_FOUND the categories for which
   some positive evidence was seen.  The result is their
   intersection.  */
1396
1397 static int
1398 detect_coding_iso2022 (src, src_end, multibytep)
1399 unsigned char *src, *src_end;
1400 int multibytep;
1401 {
1402 int mask = CODING_CATEGORY_MASK_ISO;
1403 int mask_found = 0;
/* REG[n] is the charset last designated to graphic register Gn, or
   -1 if none.
   NOTE(review): SHIFT_OUT is initialized to 0 and never assigned in
   the code visible here, so the `shift_out == 1' test in the
   ISO_CODE_SI case cannot succeed unless a macro changes it --
   confirm whether that is intended.  */
1404 int reg[4], shift_out = 0, single_shifting = 0;
1405 int c, c1, charset;
1406 /* Dummy for ONE_MORE_BYTE. */
1407 struct coding_system dummy_coding;
1408 struct coding_system *coding = &dummy_coding;
/* Assigned inside the CHARSET_OK macro.  */
1409 Lisp_Object safe_chars;
1410
1411 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1412 while (mask && src < src_end)
1413 {
1414 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
/* Re-entry point for a byte that the 0xA0..0xFF run scanner below
   has already read.  */
1415 retry:
1416 switch (c)
1417 {
1418 case ISO_CODE_ESC:
1419 if (inhibit_iso_escape_detection)
1420 break;
1421 single_shifting = 0;
1422 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1423 if (c >= '(' && c <= '/')
1424 {
1425 /* Designation sequence for a charset of dimension 1. */
1426 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
/* Intermediate bytes '(' ')' '*' '+' select a 94-char set and
   ',' '-' '.' '/' a 96-char set, hence the index `c >= ',''.  */
1427 if (c1 < ' ' || c1 >= 0x80
1428 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1429 /* Invalid designation sequence. Just ignore. */
1430 break;
/* Both groups of four intermediate bytes map to G0..G3 in order.  */
1431 reg[(c - '(') % 4] = charset;
1432 }
1433 else if (c == '$')
1434 {
1435 /* Designation sequence for a charset of dimension 2. */
1436 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1437 if (c >= '@' && c <= 'B')
1438 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
1439 reg[0] = charset = iso_charset_table[1][0][c];
1440 else if (c >= '(' && c <= '/')
1441 {
1442 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1443 if (c1 < ' ' || c1 >= 0x80
1444 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1445 /* Invalid designation sequence. Just ignore. */
1446 break;
1447 reg[(c - '(') % 4] = charset;
1448 }
1449 else
1450 /* Invalid designation sequence. Just ignore. */
1451 break;
1452 }
1453 else if (c == 'N' || c == 'O')
1454 {
1455 /* ESC <Fe> for SS2 or SS3. */
1456 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1457 break;
1458 }
1459 else if (c >= '0' && c <= '4')
1460 {
1461 /* ESC <Fp> for start/end composition. */
/* Each category stays a candidate only if its coding system has
   composition enabled.  */
1462 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1463 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1464 else
1465 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1466 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1467 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1468 else
1469 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1470 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1471 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1472 else
1473 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1474 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1475 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1476 else
1477 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1478 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1479 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1480 else
1481 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1482 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1483 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1484 else
1485 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1486 break;
1487 }
1488 else
1489 /* Invalid escape sequence. Just ignore. */
1490 break;
1491
/* Only the two designation branches above reach this point; every
   other branch has already left the switch with `break'.  */
1492 /* We found a valid designation sequence for CHARSET. */
1493 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1494 c = MAKE_CHAR (charset, 0, 0);
1495 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1496 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1497 else
1498 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1499 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1500 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1501 else
1502 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1503 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1504 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1505 else
1506 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1507 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1508 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1509 else
1510 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1511 break;
1512
1513 case ISO_CODE_SO:
1514 if (inhibit_iso_escape_detection)
1515 break;
1516 single_shifting = 0;
1517 if (shift_out == 0
1518 && (reg[1] >= 0
1519 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1520 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1521 {
1522 /* Locking shift out. */
1523 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1524 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1525 }
1526 break;
1527
1528 case ISO_CODE_SI:
1529 if (inhibit_iso_escape_detection)
1530 break;
1531 single_shifting = 0;
1532 if (shift_out == 1)
1533 {
1534 /* Locking shift in. */
1535 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1536 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1537 }
1538 break;
1539
1540 case ISO_CODE_CSI:
1541 single_shifting = 0;
/* Fall through: CSI shares the handling below with SS2 and SS3,
   except for the part guarded by `c != ISO_CODE_CSI'.  */
1542 case ISO_CODE_SS2:
1543 case ISO_CODE_SS3:
1544 {
/* NEWMASK collects the categories that remain possible after this
   control code.  */
1545 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1546
1547 if (inhibit_iso_escape_detection)
1548 break;
1549 if (c != ISO_CODE_CSI)
1550 {
1551 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1552 & CODING_FLAG_ISO_SINGLE_SHIFT)
1553 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1554 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1555 & CODING_FLAG_ISO_SINGLE_SHIFT)
1556 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1557 single_shifting = 1;
1558 }
/* The byte may instead be an extra Latin code listed in
   Vlatin_extra_code_table.  */
1559 if (VECTORP (Vlatin_extra_code_table)
1560 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1561 {
1562 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1563 & CODING_FLAG_ISO_LATIN_EXTRA)
1564 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1565 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1566 & CODING_FLAG_ISO_LATIN_EXTRA)
1567 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1568 }
1569 mask &= newmask;
1570 mask_found |= newmask;
1571 }
1572 break;
1573
1574 default:
1575 if (c < 0x80)
1576 {
1577 single_shifting = 0;
1578 break;
1579 }
1580 else if (c < 0xA0)
1581 {
/* A code in 0x80..0x9F that was not handled by a case above is
   acceptable only as an extra Latin code; otherwise the text is
   not ISO2022 at all.  */
1582 single_shifting = 0;
1583 if (VECTORP (Vlatin_extra_code_table)
1584 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1585 {
1586 int newmask = 0;
1587
1588 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1589 & CODING_FLAG_ISO_LATIN_EXTRA)
1590 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1591 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1592 & CODING_FLAG_ISO_LATIN_EXTRA)
1593 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1594 mask &= newmask;
1595 mask_found |= newmask;
1596 }
1597 else
1598 return 0;
1599 }
1600 else
1601 {
/* A code in 0xA0..0xFF rules out every 7-bit category.  */
1602 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1603 | CODING_CATEGORY_MASK_ISO_7_ELSE);
1604 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1605 /* Check the length of succeeding codes of the range
1606 0xA0..0xFF. If the byte length is odd, we exclude
1607 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
1608 when we are not single shifting. */
1609 if (!single_shifting
1610 && mask & CODING_CATEGORY_MASK_ISO_8_2)
1611 {
/* I counts the bytes of the run, starting with the one already in
   hand.  C is reset to -1 so that we can tell afterwards whether
   the loop below read a byte.  */
1612 int i = 1;
1613
1614 c = -1;
1615 while (src < src_end)
1616 {
1617 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1618 if (c < 0xA0)
1619 break;
1620 i++;
1621 }
1622
/* An odd run length is held against ISO_8_2 only if the run ended
   before the end of the text.  */
1623 if (i & 1 && src < src_end)
1624 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1625 else
1626 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1627 if (c >= 0)
1628 /* This means that we have read one extra byte. */
1629 goto retry;
1630 }
1631 }
1632 break;
1633 }
1634 }
1635 label_end_of_loop:
/* Report only the categories that were both not ruled out and
   positively evidenced.  */
1636 return (mask & mask_found);
1637 }
1638
/* Build the character of CHARSET whose 1st position code is C1 and
   whose 2nd position code is C2, and return its character code.
   When the variable `translation_table' of the enclosing function is
   non-nil, return the code translated through that table instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2) \
  (! NILP (translation_table) \
   ? translate_char (translation_table, -1, charset, c1, c2) \
   : MAKE_CHAR (charset, c1, c2))
1648
/* Set designation state into CODING.

   Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   and designate it to graphic register REG of the coding system
   `coding'.  The designation is accepted when the charset exists and
   either the coding system requests that charset for REG or its
   representative character is in `safe_chars'.  In direction mode a
   charset that has a reverse charset is replaced by the latter.

   In every other case control jumps to `label_invalid_code':
     - FINAL_CHAR is outside '0'..127;
     - the designation is not acceptable (REG is then remembered in
       last_invalid_designation_register);
     - ASCII is designated to register 0 right after an invalid
       designation to register 0, so that the sequence is kept in the
       text as is.

   This macro uses the variables `coding' and `safe_chars' and the
   label `label_invalid_code' of the function it is expanded in.  It
   declares locals named `charset' and `c', so the arguments must not
   refer to variables of those names; they are also evaluated more
   than once.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset, c; \
 \
    if ((final_char) < '0' || (final_char) >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    c = MAKE_CHAR (charset, 0, 0); \
    if (charset >= 0 \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || CODING_SAFE_CHAR_P (safe_chars, c))) \
      { \
        if (coding->spec.iso2022.last_invalid_designation_register == 0 \
            && (reg) == 0 \
            && charset == CHARSET_ASCII) \
          { \
            /* We should insert this designation sequence as is so \
               that it is surely written back to a file.  */ \
            coding->spec.iso2022.last_invalid_designation_register = -1; \
            goto label_invalid_code; \
          } \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        if ((coding->mode & CODING_MODE_DIRECTION) \
            && CHARSET_REVERSE_CHARSET (charset) >= 0) \
          charset = CHARSET_REVERSE_CHARSET (charset); \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
      } \
    else \
      { \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code; \
      } \
  } while (0)
1685
1686 /* Allocate a memory block for storing information about compositions.
1687 The block is chained to the already allocated blocks. */
1688
1689 void
1690 coding_allocate_composition_data (coding, char_offset)
1691 struct coding_system *coding;
1692 int char_offset;
1693 {
1694 struct composition_data *cmp_data
1695 = (struct composition_data *) xmalloc (sizeof *cmp_data);
1696
1697 cmp_data->char_offset = char_offset;
1698 cmp_data->used = 0;
1699 cmp_data->prev = coding->cmp_data;
1700 cmp_data->next = NULL;
1701 if (coding->cmp_data)
1702 coding->cmp_data->next = cmp_data;
1703 coding->cmp_data = cmp_data;
1704 coding->cmp_data_start = 0;
1705 coding->composing = COMPOSITION_NO;
1706 }
1707
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the character that followed ESC.  Three cases are handled:
     - composition is disabled in `coding': the two bytes ESC C1 are
       written to `dst' as ordinary text;
     - no composition is in progress: a new one is started with the
       method selected by C1, unless the composition data has no room
       left, in which case CODING_FINISH_INSUFFICIENT_CMP is stored in
       coding->result and control jumps to `label_end_of_loop';
     - a composition is already in progress: for the two ALTCHARS
       methods the method switches to COMPOSITION_RELATIVE.

   This macro uses the variables `coding' and `dst' and the label
   `label_end_of_loop' of the function it is expanded in.  C1 is
   evaluated more than once.  */

#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (coding->composing == COMPOSITION_DISABLED) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = (c1) & 0x7f; \
        coding->produced_char += 2; \
      } \
    else if (!COMPOSING_P (coding)) \
      { \
        /* This is surely the start of a composition.  We must be sure \
           that coding->cmp_data has enough space to store the \
           information about the composition.  If not, terminate the \
           current decoding loop, allocate one more memory block for \
           coding->cmp_data in the caller, then start the decoding \
           loop again.  We can't allocate memory here directly because \
           it may cause buffer/string relocation.  */ \
        if (!coding->cmp_data \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE)) \
          { \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
            goto label_end_of_loop; \
          } \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS \
                             : COMPOSITION_WITH_RULE_ALTCHARS); \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
                                      coding->composing); \
        coding->composition_rule_follows = 0; \
      } \
    else \
      { \
        /* We are already handling a composition.  If the method is \
           the following two, the codes following the current escape \
           sequence are actual characters stored in a buffer.  */ \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
          { \
            coding->composing = COMPOSITION_RELATIVE; \
            coding->composition_rule_follows = 0; \
          } \
      } \
  } while (0)
1760
/* Handle composition end sequence ESC 1.

   While a composition is in progress, record its end at the current
   number of produced characters and leave composing state.
   Otherwise write the two bytes ESC C1 to `dst' as ordinary text.
   Uses the variables `coding' and `dst' of the enclosing function.  */

#define DECODE_COMPOSITION_END(c1) \
  do { \
    if (COMPOSING_P (coding)) \
      { \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
        coding->composing = COMPOSITION_NO; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = c1; \
        coding->produced_char += 2; \
      } \
  } while (0)
1777
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be a modifiable lvalue: 32 is subtracted from it in place.
   After the subtraction, a value below 81 is an old-format rule held
   in that single byte; a value from 81 to 92 is the first byte of a
   new-format rule whose second byte is read into the variable `c2' of
   the enclosing function with ONE_MORE_BYTE.  Any other value leaves
   the stored rule at 0.

   This macro uses the variables `coding' and `c2' of the function it
   is expanded in, plus whatever ONE_MORE_BYTE requires.  */

#define DECODE_COMPOSITION_RULE(c1) \
  do { \
    int rule = 0; \
    (c1) -= 32; \
    if ((c1) < 81) /* old format (before ver.21) */ \
      { \
        int gref = (c1) / 9; \
        int nref = (c1) % 9; \
        if (gref == 4) gref = 10; \
        if (nref == 4) nref = 10; \
        rule = COMPOSITION_ENCODE_RULE (gref, nref); \
      } \
    else if ((c1) < 93) /* new format (after ver.21) */ \
      { \
        ONE_MORE_BYTE (c2); \
        rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32); \
      } \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
    coding->composition_rule_follows = 0; \
  } while (0)
1802
1803
1804 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1805
1806 static void
1807 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1808 struct coding_system *coding;
1809 unsigned char *source, *destination;
1810 int src_bytes, dst_bytes;
1811 {
1812 unsigned char *src = source;
1813 unsigned char *src_end = source + src_bytes;
1814 unsigned char *dst = destination;
1815 unsigned char *dst_end = destination + dst_bytes;
1816 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1817 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1818 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1819 /* SRC_BASE remembers the start position in source in each loop.
1820 The loop will be exited when there's not enough source code
1821 (within macro ONE_MORE_BYTE), or when there's not enough
1822 destination area to produce a character (within macro
1823 EMIT_CHAR). */
1824 unsigned char *src_base;
1825 int c, charset;
1826 Lisp_Object translation_table;
1827 Lisp_Object safe_chars;
1828
1829 safe_chars = coding_safe_chars (coding->symbol);
1830
1831 if (NILP (Venable_character_translation))
1832 translation_table = Qnil;
1833 else
1834 {
1835 translation_table = coding->translation_table_for_decode;
1836 if (NILP (translation_table))
1837 translation_table = Vstandard_translation_table_for_decode;
1838 }
1839
1840 coding->result = CODING_FINISH_NORMAL;
1841
1842 while (1)
1843 {
1844 int c1, c2 = 0;
1845
1846 src_base = src;
1847 ONE_MORE_BYTE (c1);
1848
1849 /* We produce no character or one character. */
1850 switch (iso_code_class [c1])
1851 {
1852 case ISO_0x20_or_0x7F:
1853 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1854 {
1855 DECODE_COMPOSITION_RULE (c1);
1856 continue;
1857 }
1858 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1859 {
1860 /* This is SPACE or DEL. */
1861 charset = CHARSET_ASCII;
1862 break;
1863 }
1864 /* This is a graphic character, we fall down ... */
1865
1866 case ISO_graphic_plane_0:
1867 if (COMPOSING_P (coding) && coding->composition_rule_follows)
1868 {
1869 DECODE_COMPOSITION_RULE (c1);
1870 continue;
1871 }
1872 charset = charset0;
1873 break;
1874
1875 case ISO_0xA0_or_0xFF:
1876 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1877 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1878 goto label_invalid_code;
1879 /* This is a graphic character, we fall down ... */
1880
1881 case ISO_graphic_plane_1:
1882 if (charset1 < 0)
1883 goto label_invalid_code;
1884 charset = charset1;
1885 break;
1886
1887 case ISO_control_0:
1888 if (COMPOSING_P (coding))
1889 DECODE_COMPOSITION_END ('1');
1890
1891 /* All ISO2022 control characters in this class have the
1892 same representation in Emacs internal format. */
1893 if (c1 == '\n'
1894 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1895 && (coding->eol_type == CODING_EOL_CR
1896 || coding->eol_type == CODING_EOL_CRLF))
1897 {
1898 coding->result = CODING_FINISH_INCONSISTENT_EOL;
1899 goto label_end_of_loop;
1900 }
1901 charset = CHARSET_ASCII;
1902 break;
1903
1904 case ISO_control_1:
1905 if (COMPOSING_P (coding))
1906 DECODE_COMPOSITION_END ('1');
1907 goto label_invalid_code;
1908
1909 case ISO_carriage_return:
1910 if (COMPOSING_P (coding))
1911 DECODE_COMPOSITION_END ('1');
1912
1913 if (coding->eol_type == CODING_EOL_CR)
1914 c1 = '\n';
1915 else if (coding->eol_type == CODING_EOL_CRLF)
1916 {
1917 ONE_MORE_BYTE (c1);
1918 if (c1 != ISO_CODE_LF)
1919 {
1920 src--;
1921 c1 = '\r';
1922 }
1923 }
1924 charset = CHARSET_ASCII;
1925 break;
1926
1927 case ISO_shift_out:
1928 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1929 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1930 goto label_invalid_code;
1931 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1932 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1933 continue;
1934
1935 case ISO_shift_in:
1936 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1937 goto label_invalid_code;
1938 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1939 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1940 continue;
1941
1942 case ISO_single_shift_2_7:
1943 case ISO_single_shift_2:
1944 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1945 goto label_invalid_code;
1946 /* SS2 is handled as an escape sequence of ESC 'N' */
1947 c1 = 'N';
1948 goto label_escape_sequence;
1949
1950 case ISO_single_shift_3:
1951 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1952 goto label_invalid_code;
1953 /* SS3 is handled as an escape sequence of ESC 'O' */
1954 c1 = 'O';
1955 goto label_escape_sequence;
1956
1957 case ISO_control_sequence_introducer:
1958 /* CSI is handled as an escape sequence of ESC '[' ... */
1959 c1 = '[';
1960 goto label_escape_sequence;
1961
1962 case ISO_escape:
1963 ONE_MORE_BYTE (c1);
1964 label_escape_sequence:
1965 /* Escape sequences handled by Emacs are invocation,
1966 designation, direction specification, and character
1967 composition specification. */
1968 switch (c1)
1969 {
1970 case '&': /* revision of following character set */
1971 ONE_MORE_BYTE (c1);
1972 if (!(c1 >= '@' && c1 <= '~'))
1973 goto label_invalid_code;
1974 ONE_MORE_BYTE (c1);
1975 if (c1 != ISO_CODE_ESC)
1976 goto label_invalid_code;
1977 ONE_MORE_BYTE (c1);
1978 goto label_escape_sequence;
1979
1980 case '$': /* designation of 2-byte character set */
1981 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1982 goto label_invalid_code;
1983 ONE_MORE_BYTE (c1);
1984 if (c1 >= '@' && c1 <= 'B')
1985 { /* designation of JISX0208.1978, GB2312.1980,
1986 or JISX0208.1980 */
1987 DECODE_DESIGNATION (0, 2, 94, c1);
1988 }
1989 else if (c1 >= 0x28 && c1 <= 0x2B)
1990 { /* designation of DIMENSION2_CHARS94 character set */
1991 ONE_MORE_BYTE (c2);
1992 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1993 }
1994 else if (c1 >= 0x2C && c1 <= 0x2F)
1995 { /* designation of DIMENSION2_CHARS96 character set */
1996 ONE_MORE_BYTE (c2);
1997 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1998 }
1999 else
2000 goto label_invalid_code;
2001 /* We must update these variables now. */
2002 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2003 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2004 continue;
2005
2006 case 'n': /* invocation of locking-shift-2 */
2007 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2008 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2009 goto label_invalid_code;
2010 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2011 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2012 continue;
2013
2014 case 'o': /* invocation of locking-shift-3 */
2015 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2016 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2017 goto label_invalid_code;
2018 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2019 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2020 continue;
2021
2022 case 'N': /* invocation of single-shift-2 */
2023 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2024 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2025 goto label_invalid_code;
2026 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2027 ONE_MORE_BYTE (c1);
2028 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2029 goto label_invalid_code;
2030 break;
2031
2032 case 'O': /* invocation of single-shift-3 */
2033 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2034 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2035 goto label_invalid_code;
2036 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2037 ONE_MORE_BYTE (c1);
2038 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2039 goto label_invalid_code;
2040 break;
2041
2042 case '0': case '2': case '3': case '4': /* start composition */
2043 DECODE_COMPOSITION_START (c1);
2044 continue;
2045
2046 case '1': /* end composition */
2047 DECODE_COMPOSITION_END (c1);
2048 continue;
2049
2050 case '[': /* specification of direction */
2051 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2052 goto label_invalid_code;
2053 /* For the moment, nested direction is not supported.
2054 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2055 left-to-right, and nonzero means right-to-left. */
2056 ONE_MORE_BYTE (c1);
2057 switch (c1)
2058 {
2059 case ']': /* end of the current direction */
2060 coding->mode &= ~CODING_MODE_DIRECTION;
2061
2062 case '0': /* end of the current direction */
2063 case '1': /* start of left-to-right direction */
2064 ONE_MORE_BYTE (c1);
2065 if (c1 == ']')
2066 coding->mode &= ~CODING_MODE_DIRECTION;
2067 else
2068 goto label_invalid_code;
2069 break;
2070
2071 case '2': /* start of right-to-left direction */
2072 ONE_MORE_BYTE (c1);
2073 if (c1 == ']')
2074 coding->mode |= CODING_MODE_DIRECTION;
2075 else
2076 goto label_invalid_code;
2077 break;
2078
2079 default:
2080 goto label_invalid_code;
2081 }
2082 continue;
2083
2084 case '%':
2085 if (COMPOSING_P (coding))
2086 DECODE_COMPOSITION_END ('1');
2087 ONE_MORE_BYTE (c1);
2088 if (c1 == '/')
2089 {
2090 /* CTEXT extended segment:
2091 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2092 We keep these bytes as is for the moment.
2093 They may be decoded by post-read-conversion. */
2094 int dim, M, L;
2095 int size, required;
2096 int produced_chars;
2097
2098 ONE_MORE_BYTE (dim);
2099 ONE_MORE_BYTE (M);
2100 ONE_MORE_BYTE (L);
2101 size = ((M - 128) * 128) + (L - 128);
2102 required = 8 + size * 2;
2103 if (dst + required > (dst_bytes ? dst_end : src))
2104 goto label_end_of_loop;
2105 *dst++ = ISO_CODE_ESC;
2106 *dst++ = '%';
2107 *dst++ = '/';
2108 *dst++ = dim;
2109 produced_chars = 4;
2110 dst += CHAR_STRING (M, dst), produced_chars++;
2111 dst += CHAR_STRING (L, dst), produced_chars++;
2112 while (size-- > 0)
2113 {
2114 ONE_MORE_BYTE (c1);
2115 dst += CHAR_STRING (c1, dst), produced_chars++;
2116 }
2117 coding->produced_char += produced_chars;
2118 }
2119 else if (c1 == 'G')
2120 {
2121 unsigned char *d = dst;
2122 int produced_chars;
2123
2124 /* XFree86 extension for embedding UTF-8 in CTEXT:
2125 ESC % G --UTF-8-BYTES-- ESC % @
2126 We keep these bytes as is for the moment.
2127 They may be decoded by post-read-conversion. */
2128 if (d + 6 > (dst_bytes ? dst_end : src))
2129 goto label_end_of_loop;
2130 *d++ = ISO_CODE_ESC;
2131 *d++ = '%';
2132 *d++ = 'G';
2133 produced_chars = 3;
2134 while (d + 1 < (dst_bytes ? dst_end : src))
2135 {
2136 ONE_MORE_BYTE (c1);
2137 if (c1 == ISO_CODE_ESC
2138 && src + 1 < src_end
2139 && src[0] == '%'
2140 && src[1] == '@')
2141 {
2142 src += 2;
2143 break;
2144 }
2145 d += CHAR_STRING (c1, d), produced_chars++;
2146 }
2147 if (d + 3 > (dst_bytes ? dst_end : src))
2148 goto label_end_of_loop;
2149 *d++ = ISO_CODE_ESC;
2150 *d++ = '%';
2151 *d++ = '@';
2152 dst = d;
2153 coding->produced_char += produced_chars + 3;
2154 }
2155 else
2156 goto label_invalid_code;
2157 continue;
2158
2159 default:
2160 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2161 goto label_invalid_code;
2162 if (c1 >= 0x28 && c1 <= 0x2B)
2163 { /* designation of DIMENSION1_CHARS94 character set */
2164 ONE_MORE_BYTE (c2);
2165 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2166 }
2167 else if (c1 >= 0x2C && c1 <= 0x2F)
2168 { /* designation of DIMENSION1_CHARS96 character set */
2169 ONE_MORE_BYTE (c2);
2170 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2171 }
2172 else
2173 goto label_invalid_code;
2174 /* We must update these variables now. */
2175 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2176 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2177 continue;
2178 }
2179 }
2180
2181 /* Now we know CHARSET and 1st position code C1 of a character.
2182 Produce a multibyte sequence for that character while getting
2183 2nd position code C2 if necessary. */
2184 if (CHARSET_DIMENSION (charset) == 2)
2185 {
2186 ONE_MORE_BYTE (c2);
2187 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2188 /* C2 is not in a valid range. */
2189 goto label_invalid_code;
2190 }
2191 c = DECODE_ISO_CHARACTER (charset, c1, c2);
2192 EMIT_CHAR (c);
2193 continue;
2194
2195 label_invalid_code:
2196 coding->errors++;
2197 if (COMPOSING_P (coding))
2198 DECODE_COMPOSITION_END ('1');
2199 src = src_base;
2200 c = *src++;
2201 EMIT_CHAR (c);
2202 }
2203
2204 label_end_of_loop:
2205 coding->consumed = coding->consumed_char = src_base - source;
2206 coding->produced = dst - destination;
2207 return;
2208 }
2209
2210
2211 /* ISO2022 encoding stuff. */
2212
2213 /*
2214 It is not enough to say just "ISO2022" on encoding, we have to
2215 specify more details. In Emacs, each ISO2022 coding system
2216 variant has the following specifications:
2217 1. Initial designation to G0 through G3.
2218 2. Allows short-form designation?
2219 3. ASCII should be designated to G0 before control characters?
2220 4. ASCII should be designated to G0 at end of line?
2221 5. 7-bit environment or 8-bit environment?
2222 6. Use locking-shift?
2223 7. Use Single-shift?
2224 And the following two are only for Japanese:
2225 8. Use ASCII in place of JIS0201-1976-Roman?
2226 9. Use JISX0208-1983 in place of JISX0208-1978?
2227 These specifications are encoded in `coding->flags' as flag bits
2228 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
2229 details.
2230 */
2231
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG, advance DST past it, and record the designation in
   CODING.

   When the revision number of CHARSET is below 255, the sequence is
   preceded by ESC '&' <'@' + revision>.  For a 2-dimensional 94-char
   set designated to register 0 whose <final-char> is '@', 'A', or
   'B', the intermediate character is left out (short form) if CODING
   has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char designation_final					\
      = CHARSET_ISO_FINAL_CHAR (charset);				\
    int designation_rev							\
      = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);		\
									\
    if (designation_rev < 255)						\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + designation_rev;					\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (CHARSET_DIMENSION (charset) != 1)				\
      {									\
	/* Multi-byte set: '$' comes before the intermediate.  */	\
	*dst++ = '$';							\
	if (CHARSET_CHARS (charset) != 94)				\
	  *dst++ = (unsigned char) (",-./"[reg]);			\
	else if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)		\
		 || reg != 0						\
		 || designation_final < '@'				\
		 || designation_final > 'B')				\
	  *dst++ = (unsigned char) ("()*+"[reg]);			\
      }									\
    else if (CHARSET_CHARS (charset) == 94)				\
      *dst++ = (unsigned char) ("()*+"[reg]);				\
    else								\
      *dst++ = (unsigned char) (",-./"[reg]);				\
    *dst++ = designation_final;						\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
2274
/* Single-shift functions.  Each macro emits the code for
   single-shift-2 or single-shift-3 at DST (the two-byte escape
   sequence when CODING is restricted to seven bits, the one-byte
   control character otherwise) and sets the single-shifting flag of
   CODING.  */

#define ENCODE_SINGLE_SHIFT_2					\
  do {								\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
      *dst++ = ISO_CODE_SS2;					\
    else							\
      {								\
	*dst++ = ISO_CODE_ESC;					\
	*dst++ = 'N';						\
      }								\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;		\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3					\
  do {								\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
      *dst++ = ISO_CODE_SS3;					\
    else							\
      {								\
	*dst++ = ISO_CODE_ESC;					\
	*dst++ = 'O';						\
      }								\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;		\
  } while (0)
2296
/* Locking-shift functions.  Each of the four macros below emits at
   DST the control character or escape sequence for one locking shift
   (shift-in, shift-out, locking-shift-2, locking-shift-3) and records
   in CODING that graphic register 0, 1, 2, or 3 respectively is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    dst[0] = ISO_CODE_SI;				\
    dst += 1;						\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
  } while (0)

#define ENCODE_SHIFT_OUT				\
  do {							\
    dst[0] = ISO_CODE_SO;				\
    dst += 1;						\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
2324
/* Emit at DST the byte for a DIMENSION1 character of CHARSET whose
   position-code is C1.

   If a single shift is pending, the byte is written in 7-bit or
   8-bit form according to CODING_FLAG_ISO_SEVEN_BITS and the pending
   flag is cleared.  Otherwise the byte is written with the high bit
   clear when CHARSET is on graphic plane 0, or set when it is on
   graphic plane 1.  When CHARSET is on neither plane,
   encode_invocation_designation is called to emit the necessary
   designation/invocation codes and the test is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    for (;;)								\
      {									\
	if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
	  {								\
	    *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
		      ? (c1 & 0x7F) : (c1 | 0x80));			\
	    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;		\
	    break;							\
	  }								\
	if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    break;							\
	  }								\
	if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    break;							\
	  }								\
	/* CHARSET is on no graphic plane yet: invoke it (designating	\
	   it to a register first if needed), then go round again to	\
	   actually produce the character.  */				\
	dst = encode_invocation_designation (charset, coding, dst);	\
      }									\
  } while (0)
2357
/* Emit at DST the two bytes for a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2.

   If a single shift is pending, both bytes are written in 7-bit or
   8-bit form according to CODING_FLAG_ISO_SEVEN_BITS and the pending
   flag is cleared.  Otherwise both bytes are written with the high
   bit clear when CHARSET is on graphic plane 0, or set when it is on
   graphic plane 1.  When CHARSET is on neither plane,
   encode_invocation_designation is called to emit the necessary
   designation/invocation codes and the test is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    for (;;)								\
      {									\
	if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
	  {								\
	    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)		\
	      {								\
		*dst++ = c1 & 0x7F;					\
		*dst++ = c2 & 0x7F;					\
	      }								\
	    else							\
	      {								\
		*dst++ = c1 | 0x80;					\
		*dst++ = c2 | 0x80;					\
	      }								\
	    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;		\
	    break;							\
	  }								\
	if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    *dst++ = c2 & 0x7F;						\
	    break;							\
	  }								\
	if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    *dst++ = c2 | 0x80;						\
	    break;							\
	  }								\
	/* CHARSET is on no graphic plane yet: invoke it (designating	\
	   it to a register first if needed), then go round again to	\
	   actually produce the character.  */				\
	dst = encode_invocation_designation (charset, coding, dst);	\
      }									\
  } while (0)
2390
/* Encode character C at DST.  C is split into its charset and
   position-codes.  If that charset is not defined, the position-codes
   are written out as raw bytes (the second one only when it is
   non-negative).  Otherwise the character goes through the DIMENSION1
   or DIMENSION2 macro, after substituting latin-jisx0201 for ASCII
   when CODING_FLAG_ISO_USE_ROMAN is set, or jisx0208-1978 for
   jisx0208 when CODING_FLAG_ISO_USE_OLDJIS is set.  */

#define ENCODE_ISO_CHARACTER(c)						\
  do {									\
    int charset, c1, c2;						\
									\
    SPLIT_CHAR (c, charset, c1, c2);					\
    if (! CHARSET_DEFINED_P (charset))					\
      {									\
	*dst++ = c1;							\
	if (c2 >= 0)							\
	  *dst++ = c2;							\
      }									\
    else if (CHARSET_DIMENSION (charset) != 1)				\
      {									\
	if ((charset == charset_jisx0208)				\
	    && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))		\
	  charset = charset_jisx0208_1978;				\
	ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);		\
      }									\
    else								\
      {									\
	if ((charset == CHARSET_ASCII)					\
	    && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))		\
	  charset = charset_latin_jisx0201;				\
	ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);			\
      }									\
  } while (0)
2420
2421
/* Stand-in for a character C that must not be encoded: emit
   CODING_REPLACEMENT_CHARACTER once, and a second time when the
   charset of C has a width greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)					\
  do {									\
    int unsafe_extra_left = 1;						\
									\
    do									\
      {									\
	ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);		\
      }									\
    while (unsafe_extra_left-- > 0					\
	   && CHARSET_WIDTH (CHAR_CHARSET (c)) > 1);			\
  } while (0)
2430
2431
2432 /* Produce designation and invocation codes at a place pointed by DST
2433 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
2434 Return new DST. */
2435
2436 unsigned char *
2437 encode_invocation_designation (charset, coding, dst)
2438 int charset;
2439 struct coding_system *coding;
2440 unsigned char *dst;
2441 {
2442 int reg; /* graphic register number */
2443
2444 /* At first, check designations. */
2445 for (reg = 0; reg < 4; reg++)
2446 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2447 break;
2448
2449 if (reg >= 4)
2450 {
2451 /* CHARSET is not yet designated to any graphic registers. */
2452 /* At first check the requested designation. */
2453 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2454 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2455 /* Since CHARSET requests no special designation, designate it
2456 to graphic register 0. */
2457 reg = 0;
2458
2459 ENCODE_DESIGNATION (charset, reg, coding);
2460 }
2461
2462 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2463 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2464 {
2465 /* Since the graphic register REG is not invoked to any graphic
2466 planes, invoke it to graphic plane 0. */
2467 switch (reg)
2468 {
2469 case 0: /* graphic register 0 */
2470 ENCODE_SHIFT_IN;
2471 break;
2472
2473 case 1: /* graphic register 1 */
2474 ENCODE_SHIFT_OUT;
2475 break;
2476
2477 case 2: /* graphic register 2 */
2478 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2479 ENCODE_SINGLE_SHIFT_2;
2480 else
2481 ENCODE_LOCKING_SHIFT_2;
2482 break;
2483
2484 case 3: /* graphic register 3 */
2485 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2486 ENCODE_SINGLE_SHIFT_3;
2487 else
2488 ENCODE_LOCKING_SHIFT_3;
2489 break;
2490 }
2491 }
2492
2493 return dst;
2494 }
2495
/* Emit at DST the two bytes that encode composition rule RULE: the
   rule is split by COMPOSITION_DECODE_RULE into two reference values,
   written as 32 + 81 + <first> and 32 + <second>.  */

#define ENCODE_COMPOSITION_RULE(rule)					\
  do {									\
    int rule_gref, rule_nref;						\
									\
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);		\
    dst[0] = 32 + 81 + rule_gref;					\
    dst[1] = 32 + rule_nref;						\
    dst += 2;								\
  } while (0)
2505
/* Emit at DST the escape sequence that opens a composition: ESC 0
   for a relative composition, ESC 3 for one with alternate chars,
   ESC 4 otherwise.  DATA points to an array of integers describing
   the composition (see coding.h for its format); DATA[3] is stored
   as the composition method in CODING.  For the non-relative
   methods, the component index of CODING is set just past the four
   leading integers and the rule-follows flag is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)				\
  do {									\
    coding->composing = data[3];					\
    *dst++ = ISO_CODE_ESC;						\
    if (coding->composing != COMPOSITION_RELATIVE)			\
      {									\
	if (coding->composing == COMPOSITION_WITH_ALTCHARS)		\
	  *dst++ = '3';							\
	else								\
	  *dst++ = '4';							\
	coding->cmp_data_index = coding->cmp_data_start + 4;		\
	coding->composition_rule_follows = 0;				\
      }									\
    else								\
      *dst++ = '0';							\
  } while (0)
2525
/* Emit ESC 1 at DST to close the current composition, advance the
   composition-data start of CODING by DATA[0], and mark CODING as not
   composing.  When that start has reached the used length of the
   current composition-data block and another block follows, move on
   to the next block and restart from its beginning.  */

#define ENCODE_COMPOSITION_END(coding, data)				\
  do {									\
    dst[0] = ISO_CODE_ESC;						\
    dst[1] = '1';							\
    dst += 2;								\
    coding->cmp_data_start += data[0];					\
    coding->composing = COMPOSITION_NO;					\
    if (coding->cmp_data_start == coding->cmp_data->used)		\
      {									\
	if (coding->cmp_data->next)					\
	  {								\
	    coding->cmp_data = coding->cmp_data->next;			\
	    coding->cmp_data_start = 0;					\
	  }								\
      }									\
  } while (0)
2541
/* Emit the composition start sequence ESC 0 at DST and switch CODING
   to the relative composition method.  This does not open a new
   composition: it marks that the components (alternate chars and
   composition rules) have just been produced and that the actual
   text of the composition follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)		\
  do {							\
    dst[0] = ISO_CODE_ESC;				\
    dst[1] = '0';					\
    dst += 2;						\
    coding->composing = COMPOSITION_RELATIVE;		\
  } while (0)
2553
/* The following three macros produce codes for indicating direction
   of text.  All of them write at DST and advance it, and read the
   flags of CODING; each expands to a single statement.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits CSI: the two bytes ESC '['
   when CODING_FLAG_ISO_SEVEN_BITS is set in the flags of CODING, the
   single byte ISO_CODE_CSI otherwise.  The flag is tested with `&',
   like every other flag test in this file, so the 7-bit form is
   chosen even when other flag bits are set as well.

   ENCODE_DIRECTION_R2L emits CSI '2' ']' and ENCODE_DIRECTION_L2R
   emits CSI '0' ']'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER			\
  do {								\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)		\
      {								\
	*dst++ = ISO_CODE_ESC;					\
	*dst++ = '[';						\
      }								\
    else							\
      *dst++ = ISO_CODE_CSI;					\
  } while (0)

#define ENCODE_DIRECTION_R2L					\
  do {								\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;				\
    *dst++ = '2';						\
    *dst++ = ']';						\
  } while (0)

#define ENCODE_DIRECTION_L2R					\
  do {								\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;				\
    *dst++ = '0';						\
    *dst++ = ']';						\
  } while (0)
2569
/* Emit at DST the codes that bring the graphic planes and registers
   back to their initial state: a shift-in when graphic plane 0 does
   not currently have register 0 invoked, then a designation for each
   register whose initial designation is non-negative and differs
   from its current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					\
  do {									\
    int reg = 0;							\
									\
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			\
      ENCODE_SHIFT_IN;							\
    while (reg < 4)							\
      {									\
	if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0)	\
	  {								\
	    if (CODING_SPEC_ISO_DESIGNATION (coding, reg)		\
		!= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))	\
	      ENCODE_DESIGNATION					\
		(CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),	\
		 reg, coding);						\
	  }								\
	reg++;								\
      }									\
  } while (0)
2584
2585 /* Produce designation sequences of charsets in the line started from
2586 SRC to a place pointed by DST, and return updated DST.
2587
2588 If the current block ends before any end-of-line, we may fail to
2589 find all the necessary designations. */
2590
2591 static unsigned char *
2592 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2593 struct coding_system *coding;
2594 Lisp_Object translation_table;
2595 unsigned char *src, *src_end, *dst;
2596 {
2597 int charset, c, found = 0, reg;
2598 /* Table of charsets to be designated to each graphic register. */
2599 int r[4];
2600
2601 for (reg = 0; reg < 4; reg++)
2602 r[reg] = -1;
2603
2604 while (found < 4)
2605 {
2606 ONE_MORE_CHAR (c);
2607 if (c == '\n')
2608 break;
2609
2610 charset = CHAR_CHARSET (c);
2611 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2612 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2613 {
2614 found++;
2615 r[reg] = charset;
2616 }
2617 }
2618
2619 label_end_of_loop:
2620 if (found)
2621 {
2622 for (reg = 0; reg < 4; reg++)
2623 if (r[reg] >= 0
2624 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2625 ENCODE_DESIGNATION (r[reg], reg, coding);
2626 }
2627
2628 return dst;
2629 }
2630
2631 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
2632
/* Encode the SRC_BYTES bytes of text at SOURCE into the ISO2022
   variant described by CODING and write the result at DESTINATION.
   DST_BYTES is the size of the destination area; when it is zero the
   output limit is derived from the current source position instead
   (see the check at the head of the loop).

   On return CODING->consumed is the number of source bytes handled
   by completed loop iterations, CODING->produced and
   CODING->produced_char are both the number of bytes written, and
   CODING->consumed_char counts the characters encoded.
   CODING->result is set to CODING_FINISH_INSUFFICIENT_DST when the
   loop stops for lack of output room.  */
2633 static void
2634 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2635 struct coding_system *coding;
2636 unsigned char *source, *destination;
2637 int src_bytes, dst_bytes;
2638 {
2639 unsigned char *src = source;
2640 unsigned char *src_end = source + src_bytes;
2641 unsigned char *dst = destination;
2642 unsigned char *dst_end = destination + dst_bytes;
2643 /* Since the maximum bytes produced by each loop is 20, we subtract 19
2644 from DST_END to assure overflow checking is necessary only at the
2645 head of loop. */
2646 unsigned char *adjusted_dst_end = dst_end - 19;
2647 /* SRC_BASE remembers the start position in source in each loop.
2648 The loop will be exited when there's not enough source text to
2649 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2650 there's not enough destination area to produce encoded codes
2651 (within macro EMIT_BYTES). */
2652 unsigned char *src_base;
2653 int c;
2654 Lisp_Object translation_table;
2655 Lisp_Object safe_chars;
2656
/* CODING_FLAG_ISO_SAFE turns on CODING_MODE_INHIBIT_UNENCODABLE_CHAR,
   which below sends characters not in SAFE_CHARS through
   ENCODE_UNSAFE_CHARACTER instead of encoding them.  */
2657 if (coding->flags & CODING_FLAG_ISO_SAFE)
2658 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2659
2660 safe_chars = coding_safe_chars (coding->symbol);
2661
/* Pick the translation table: none when character translation is
   disabled, else the one of CODING, else the standard one.  */
2662 if (NILP (Venable_character_translation))
2663 translation_table = Qnil;
2664 else
2665 {
2666 translation_table = coding->translation_table_for_encode;
2667 if (NILP (translation_table))
2668 translation_table = Vstandard_translation_table_for_encode;
2669 }
2670
2671 coding->consumed_char = 0;
2672 coding->errors = 0;
2673 while (1)
2674 {
2675 src_base = src;
2676
/* Stop while at least 19 bytes of room remain.  With DST_BYTES zero
   the limit is SRC - 19 -- presumably the output is then written into
   the same buffer as the source and must stay behind the read
   position (TODO confirm against callers).  */
2677 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2678 {
2679 coding->result = CODING_FINISH_INSUFFICIENT_DST;
2680 break;
2681 }
2682
2683 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2684 && CODING_SPEC_ISO_BOL (coding))
2685 {
2686 /* We have to produce designation sequences if any now. */
2687 dst = encode_designation_at_bol (coding, translation_table,
2688 src, src_end, dst);
2689 CODING_SPEC_ISO_BOL (coding) = 0;
2690 }
2691
2692 /* Check composition start and end. */
2693 if (coding->composing != COMPOSITION_DISABLED
2694 && coding->cmp_data_start < coding->cmp_data->used)
2695 {
2696 struct composition_data *cmp_data = coding->cmp_data;
2697 int *data = cmp_data->data + coding->cmp_data_start;
2698 int this_pos = cmp_data->char_offset + coding->consumed_char;
2699
2700 if (coding->composing == COMPOSITION_RELATIVE)
2701 {
/* The current position equals DATA[2]: close the composition and
   reload CMP_DATA and DATA, which ENCODE_COMPOSITION_END may have
   moved on.  */
2702 if (this_pos == data[2])
2703 {
2704 ENCODE_COMPOSITION_END (coding, data);
2705 cmp_data = coding->cmp_data;
2706 data = cmp_data->data + coding->cmp_data_start;
2707 }
2708 }
2709 else if (COMPOSING_P (coding))
2710 {
2711 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2712 if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2713 /* We have consumed components of the composition.
2714 What follows in SRC is the composition's base
2715 text. */
2716 ENCODE_COMPOSITION_FAKE_START (coding);
2717 else
2718 {
/* Emit the next component.  It comes from the composition data, not
   from SRC, so this iteration ends with `continue' without reading a
   source character or counting one as consumed.  */
2719 int c = cmp_data->data[coding->cmp_data_index++];
2720 if (coding->composition_rule_follows)
2721 {
2722 ENCODE_COMPOSITION_RULE (c);
2723 coding->composition_rule_follows = 0;
2724 }
2725 else
2726 {
2727 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2728 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2729 ENCODE_UNSAFE_CHARACTER (c);
2730 else
2731 ENCODE_ISO_CHARACTER (c);
2732 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2733 coding->composition_rule_follows = 1;
2734 }
2735 continue;
2736 }
2737 }
2738 if (!COMPOSING_P (coding))
2739 {
/* The current position equals DATA[1]: open a composition here.  */
2740 if (this_pos == data[1])
2741 {
2742 ENCODE_COMPOSITION_START (coding, data);
2743 continue;
2744 }
2745 }
2746 }
2747
2748 ONE_MORE_CHAR (c);
2749
2750 /* Now encode the character C. */
2751 if (c < 0x20 || c == 0x7F)
2752 {
2753 if (c == '\r')
2754 {
2755 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2756 {
/* Outside selective-display mode a CR is written as is, after an
   optional reset of planes and registers.
   NOTE(review): this `continue' skips the consumed_char increment at
   the bottom of the loop although a character was read from SRC --
   confirm this is intended.  */
2757 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2758 ENCODE_RESET_PLANE_AND_REGISTER;
2759 *dst++ = c;
2760 continue;
2761 }
2762 /* fall down to treat '\r' as '\n' ... */
2763 c = '\n';
2764 }
2765 if (c == '\n')
2766 {
/* End of line: optionally reset planes and registers, optionally
   copy the initial designations over the current ones, emit the
   end-of-line bytes selected by eol_type (LF is also used when the
   type is undecided), and flag that the next character starts a
   line.  */
2767 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2768 ENCODE_RESET_PLANE_AND_REGISTER;
2769 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2770 bcopy (coding->spec.iso2022.initial_designation,
2771 coding->spec.iso2022.current_designation,
2772 sizeof coding->spec.iso2022.initial_designation);
2773 if (coding->eol_type == CODING_EOL_LF
2774 || coding->eol_type == CODING_EOL_UNDECIDED)
2775 *dst++ = ISO_CODE_LF;
2776 else if (coding->eol_type == CODING_EOL_CRLF)
2777 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2778 else
2779 *dst++ = ISO_CODE_CR;
2780 CODING_SPEC_ISO_BOL (coding) = 1;
2781 }
2782 else
2783 {
/* Any other control character is written as is, after an optional
   reset of planes and registers.  */
2784 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2785 ENCODE_RESET_PLANE_AND_REGISTER;
2786 *dst++ = c;
2787 }
2788 }
2789 else if (ASCII_BYTE_P (c))
2790 ENCODE_ISO_CHARACTER (c);
2791 else if (SINGLE_BYTE_CHAR_P (c))
2792 {
/* A non-ASCII single-byte character is copied through unchanged and
   counted as an error.  */
2793 *dst++ = c;
2794 coding->errors++;
2795 }
2796 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2797 && ! CODING_SAFE_CHAR_P (safe_chars, c))
2798 ENCODE_UNSAFE_CHARACTER (c);
2799 else
2800 ENCODE_ISO_CHARACTER (c);
2801
/* Not reached by the iterations that ended with `continue' above.  */
2802 coding->consumed_char++;
2803 }
2804
/* SRC_BASE is where the last iteration began, so the bytes of an
   iteration that did not complete are not reported as consumed.  */
2805 label_end_of_loop:
2806 coding->consumed = src_base - source;
2807 coding->produced = coding->produced_char = dst - destination;
2808 }
2809
2810 \f
2811 /*** 4. SJIS and BIG5 handlers ***/
2812
2813 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2814 quite widely. So, for the moment, Emacs supports them in the bare
2815 C code. But, in the future, they may be supported only by CCL. */
2816
2817 /* SJIS is a coding system encoding three character sets: ASCII, right
2818 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2819 as is. A character of charset katakana-jisx0201 is encoded by
2820 "position-code + 0x80". A character of charset japanese-jisx0208
2821 is encoded in 2-byte but two position-codes are divided and shifted
2822 so that it fits in the range below.
2823
2824 --- CODE RANGE of SJIS ---
2825 (character set) (range)
2826 ASCII 0x00 .. 0x7F
2827 KATAKANA-JISX0201 0xA1 .. 0xDF
2828 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2829 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2830 -------------------------------
2831
2832 */
2833
2834 /* BIG5 is a coding system encoding two character sets: ASCII and
2835 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2836 character set and is encoded in two bytes.
2837
2838 --- CODE RANGE of BIG5 ---
2839 (character set) (range)
2840 ASCII 0x00 .. 0x7F
2841 Big5 (1st byte) 0xA1 .. 0xFE
2842 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2843 --------------------------
2844
2845 Since the number of characters in Big5 is larger than maximum
2846 characters in Emacs' charset (96x96), it can't be handled as one
2847 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2848 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2849 contains frequently used characters and the latter contains less
2850 frequently used characters. */
2851
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansions, so an
   expression such as `flag ? x : y' or `c | 0x80' may be passed
   without being torn apart by operator precedence.  Arguments may
   still be evaluated more than once, so they must not have side
   effects.  The output arguments (CHARSET, C1, C2 of DECODE_BIG5; B1,
   B2 of ENCODE_BIG5) must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the Big5 byte pair B1, B2 to CHARSET and position-codes C1
   and C2.  Second bytes below 0x7F are numbered from 0x40; the others
   are numbered from 0x62, so that 0xA1 directly follows 0x7E.  First
   bytes below 0xC9 belong to `charset_big5_1', the rest to
   `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int temp							\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
	 + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and position-codes C1, C2
   to the Big5 byte pair B1, B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp							\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
2884
2885 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2886 Check if a text is encoded in SJIS. If it is, return
2887 CODING_CATEGORY_MASK_SJIS, else return 0. */
2888
2889 static int
2890 detect_coding_sjis (src, src_end, multibytep)
2891 unsigned char *src, *src_end;
2892 int multibytep;
2893 {
2894 int c;
2895 /* Dummy for ONE_MORE_BYTE. */
2896 struct coding_system dummy_coding;
2897 struct coding_system *coding = &dummy_coding;
2898
2899 while (1)
2900 {
2901 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2902 if (c < 0x80)
2903 continue;
2904 if (c == 0x80 || c == 0xA0 || c > 0xEF)
2905 return 0;
2906 if (c <= 0x9F || c >= 0xE0)
2907 {
2908 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2909 if (c < 0x40 || c == 0x7F || c > 0xFC)
2910 return 0;
2911 }
2912 }
2913 label_end_of_loop:
2914 return CODING_CATEGORY_MASK_SJIS;
2915 }
2916
2917 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2918 Check if a text is encoded in BIG5. If it is, return
2919 CODING_CATEGORY_MASK_BIG5, else return 0. */
2920
2921 static int
2922 detect_coding_big5 (src, src_end, multibytep)
2923 unsigned char *src, *src_end;
2924 int multibytep;
2925 {
2926 int c;
2927 /* Dummy for ONE_MORE_BYTE. */
2928 struct coding_system dummy_coding;
2929 struct coding_system *coding = &dummy_coding;
2930
2931 while (1)
2932 {
2933 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2934 if (c < 0x80)
2935 continue;
2936 if (c < 0xA1 || c > 0xFE)
2937 return 0;
2938 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2939 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2940 return 0;
2941 }
2942 label_end_of_loop:
2943 return CODING_CATEGORY_MASK_BIG5;
2944 }
2945
2946 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2947 Check if a text is encoded in UTF-8. If it is, return
2948 CODING_CATEGORY_MASK_UTF_8, else return 0. */
2949
/* Classification of a single octet C by its leading bits.  Each
   predicate clears the bits that must match and tests that nothing
   is left in the masked positions.  */
#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)                  /* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)     ((((c) ^ 0x80) & 0xC0) == 0)   /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) ((((c) ^ 0xC0) & 0xE0) == 0)   /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) ((((c) ^ 0xE0) & 0xF0) == 0)   /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) ((((c) ^ 0xF0) & 0xF8) == 0)   /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) ((((c) ^ 0xF8) & 0xFC) == 0)   /* 111110xx */
#define UTF_8_6_OCTET_LEADING_P(c) ((((c) ^ 0xFC) & 0xFE) == 0)   /* 1111110x */
2957
2958 static int
2959 detect_coding_utf_8 (src, src_end, multibytep)
2960 unsigned char *src, *src_end;
2961 int multibytep;
2962 {
2963 unsigned char c;
2964 int seq_maybe_bytes;
2965 /* Dummy for ONE_MORE_BYTE. */
2966 struct coding_system dummy_coding;
2967 struct coding_system *coding = &dummy_coding;
2968
2969 while (1)
2970 {
2971 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2972 if (UTF_8_1_OCTET_P (c))
2973 continue;
2974 else if (UTF_8_2_OCTET_LEADING_P (c))
2975 seq_maybe_bytes = 1;
2976 else if (UTF_8_3_OCTET_LEADING_P (c))
2977 seq_maybe_bytes = 2;
2978 else if (UTF_8_4_OCTET_LEADING_P (c))
2979 seq_maybe_bytes = 3;
2980 else if (UTF_8_5_OCTET_LEADING_P (c))
2981 seq_maybe_bytes = 4;
2982 else if (UTF_8_6_OCTET_LEADING_P (c))
2983 seq_maybe_bytes = 5;
2984 else
2985 return 0;
2986
2987 do
2988 {
2989 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2990 if (!UTF_8_EXTRA_OCTET_P (c))
2991 return 0;
2992 seq_maybe_bytes--;
2993 }
2994 while (seq_maybe_bytes > 0);
2995 }
2996
2997 label_end_of_loop:
2998 return CODING_CATEGORY_MASK_UTF_8;
2999 }
3000
3001 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3002 Check if a text is encoded in UTF-16 by looking at its first two
3003 bytes. If they are the byte order mark FE FF (big endian) or FF FE
3004 (little endian), return CODING_CATEGORY_MASK_UTF_16_BE or
3005 CODING_CATEGORY_MASK_UTF_16_LE respectively, else return 0. */
3006
/* Nonzero if the 16-bit code unit VAL is one of the two values that
   are never valid in UTF-16 text (0xFFFE and 0xFFFF).  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The top six bits select the surrogate half, so the mask must be
   0xFC00; masking with the pattern itself would also accept low
   surrogates and values from 0xF800 up.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3016
3017 static int
3018 detect_coding_utf_16 (src, src_end, multibytep)
3019 unsigned char *src, *src_end;
3020 int multibytep;
3021 {
3022 unsigned char c1, c2;
3023 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */
3024 struct coding_system dummy_coding;
3025 struct coding_system *coding = &dummy_coding;
3026
3027 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3028 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3029
3030 if ((c1 == 0xFF) && (c2 == 0xFE))
3031 return CODING_CATEGORY_MASK_UTF_16_LE;
3032 else if ((c1 == 0xFE) && (c2 == 0xFF))
3033 return CODING_CATEGORY_MASK_UTF_16_BE;
3034
3035 label_end_of_loop:
3036 return 0;
3037 }
3038
3039 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3040 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3041
3042 static void
3043 decode_coding_sjis_big5 (coding, source, destination,
3044 src_bytes, dst_bytes, sjis_p)
3045 struct coding_system *coding;
3046 unsigned char *source, *destination;
3047 int src_bytes, dst_bytes;
3048 int sjis_p;
3049 {
3050 unsigned char *src = source;
3051 unsigned char *src_end = source + src_bytes;
3052 unsigned char *dst = destination;
3053 unsigned char *dst_end = destination + dst_bytes;
3054 /* SRC_BASE remembers the start position in source in each loop.
3055 The loop will be exited when there's not enough source code
3056 (within macro ONE_MORE_BYTE), or when there's not enough
3057 destination area to produce a character (within macro
3058 EMIT_CHAR). */
3059 unsigned char *src_base;
3060 Lisp_Object translation_table;
3061
3062 if (NILP (Venable_character_translation))
3063 translation_table = Qnil;
3064 else
3065 {
3066 translation_table = coding->translation_table_for_decode;
3067 if (NILP (translation_table))
3068 translation_table = Vstandard_translation_table_for_decode;
3069 }
3070
3071 coding->produced_char = 0;
3072 while (1)
3073 {
3074 int c, charset, c1, c2 = 0;
3075
3076 src_base = src;
3077 ONE_MORE_BYTE (c1);
3078
3079 if (c1 < 0x80)
3080 {
3081 charset = CHARSET_ASCII;
3082 if (c1 < 0x20)
3083 {
3084 if (c1 == '\r')
3085 {
3086 if (coding->eol_type == CODING_EOL_CRLF)
3087 {
3088 ONE_MORE_BYTE (c2);
3089 if (c2 == '\n')
3090 c1 = c2;
3091 else
3092 /* To process C2 again, SRC is subtracted by 1. */
3093 src--;
3094 }
3095 else if (coding->eol_type == CODING_EOL_CR)
3096 c1 = '\n';
3097 }
3098 else if (c1 == '\n'
3099 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3100 && (coding->eol_type == CODING_EOL_CR
3101 || coding->eol_type == CODING_EOL_CRLF))
3102 {
3103 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3104 goto label_end_of_loop;
3105 }
3106 }
3107 }
3108 else
3109 {
3110 if (sjis_p)
3111 {
3112 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3113 goto label_invalid_code;
3114 if (c1 <= 0x9F || c1 >= 0xE0)
3115 {
3116 /* SJIS -> JISX0208 */
3117 ONE_MORE_BYTE (c2);
3118 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3119 goto label_invalid_code;
3120 DECODE_SJIS (c1, c2, c1, c2);
3121 charset = charset_jisx0208;
3122 }
3123 else
3124 /* SJIS -> JISX0201-Kana */
3125 charset = charset_katakana_jisx0201;
3126 }
3127 else
3128 {
3129 /* BIG5 -> Big5 */
3130 if (c1 < 0xA0 || c1 > 0xFE)
3131 goto label_invalid_code;
3132 ONE_MORE_BYTE (c2);
3133 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3134 goto label_invalid_code;
3135 DECODE_BIG5 (c1, c2, charset, c1, c2);
3136 }
3137 }
3138
3139 c = DECODE_ISO_CHARACTER (charset, c1, c2);
3140 EMIT_CHAR (c);
3141 continue;
3142
3143 label_invalid_code:
3144 coding->errors++;
3145 src = src_base;
3146 c = *src++;
3147 EMIT_CHAR (c);
3148 }
3149
3150 label_end_of_loop:
3151 coding->consumed = coding->consumed_char = src_base - source;
3152 coding->produced = dst - destination;
3153 return;
3154 }
3155
3156 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3157 This function can encode charsets `ascii', `katakana-jisx0201',
3158 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3159 are sure that all these charsets are registered as official charset
3160 (i.e. do not have extended leading-codes). Characters of other
3161 charsets are produced without any encoding. If SJIS_P is 1, encode
3162 SJIS text, else encode BIG5 text. */
3163
3164 static void
3165 encode_coding_sjis_big5 (coding, source, destination,
3166 src_bytes, dst_bytes, sjis_p)
3167 struct coding_system *coding;
3168 unsigned char *source, *destination;
3169 int src_bytes, dst_bytes;
3170 int sjis_p;
3171 {
3172 unsigned char *src = source;
3173 unsigned char *src_end = source + src_bytes;
3174 unsigned char *dst = destination;
3175 unsigned char *dst_end = destination + dst_bytes;
3176 /* SRC_BASE remembers the start position in source in each loop.
3177 The loop will be exited when there's not enough source text to
3178 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3179 there's not enough destination area to produce encoded codes
3180 (within macro EMIT_BYTES). */
3181 unsigned char *src_base;
3182 Lisp_Object translation_table;
3183
3184 if (NILP (Venable_character_translation))
3185 translation_table = Qnil;
3186 else
3187 {
3188 translation_table = coding->translation_table_for_encode;
3189 if (NILP (translation_table))
3190 translation_table = Vstandard_translation_table_for_encode;
3191 }
3192
3193 while (1)
3194 {
3195 int c, charset, c1, c2;
3196
3197 src_base = src;
3198 ONE_MORE_CHAR (c);
3199
3200 /* Now encode the character C. */
3201 if (SINGLE_BYTE_CHAR_P (c))
3202 {
3203 switch (c)
3204 {
3205 case '\r':
3206 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3207 {
3208 EMIT_ONE_BYTE (c);
3209 break;
3210 }
3211 c = '\n';
3212 case '\n':
3213 if (coding->eol_type == CODING_EOL_CRLF)
3214 {
3215 EMIT_TWO_BYTES ('\r', c);
3216 break;
3217 }
3218 else if (coding->eol_type == CODING_EOL_CR)
3219 c = '\r';
3220 default:
3221 EMIT_ONE_BYTE (c);
3222 }
3223 }
3224 else
3225 {
3226 SPLIT_CHAR (c, charset, c1, c2);
3227 if (sjis_p)
3228 {
3229 if (charset == charset_jisx0208
3230 || charset == charset_jisx0208_1978)
3231 {
3232 ENCODE_SJIS (c1, c2, c1, c2);
3233 EMIT_TWO_BYTES (c1, c2);
3234 }
3235 else if (charset == charset_katakana_jisx0201)
3236 EMIT_ONE_BYTE (c1 | 0x80);
3237 else if (charset == charset_latin_jisx0201)
3238 EMIT_ONE_BYTE (c1);
3239 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3240 {
3241 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3242 if (CHARSET_WIDTH (charset) > 1)
3243 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3244 }
3245 else
3246 /* There's no way other than producing the internal
3247 codes as is. */
3248 EMIT_BYTES (src_base, src);
3249 }
3250 else
3251 {
3252 if (charset == charset_big5_1 || charset == charset_big5_2)
3253 {
3254 ENCODE_BIG5 (charset, c1, c2, c1, c2);
3255 EMIT_TWO_BYTES (c1, c2);
3256 }
3257 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3258 {
3259 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3260 if (CHARSET_WIDTH (charset) > 1)
3261 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3262 }
3263 else
3264 /* There's no way other than producing the internal
3265 codes as is. */
3266 EMIT_BYTES (src_base, src);
3267 }
3268 }
3269 coding->consumed_char++;
3270 }
3271
3272 label_end_of_loop:
3273 coding->consumed = src_base - source;
3274 coding->produced = coding->produced_char = dst - destination;
3275 }
3276
3277 \f
3278 /*** 5. CCL handlers ***/
3279
3280 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3281 Check if a text is encoded in a coding system of which
3282 encoder/decoder are written in CCL program. If it is, return
3283 CODING_CATEGORY_MASK_CCL, else return 0. */
3284
3285 static int
3286 detect_coding_ccl (src, src_end, multibytep)
3287 unsigned char *src, *src_end;
3288 int multibytep;
3289 {
3290 unsigned char *valid;
3291 int c;
3292 /* Dummy for ONE_MORE_BYTE. */
3293 struct coding_system dummy_coding;
3294 struct coding_system *coding = &dummy_coding;
3295
3296 /* No coding system is assigned to coding-category-ccl. */
3297 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3298 return 0;
3299
3300 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3301 while (1)
3302 {
3303 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3304 if (! valid[c])
3305 return 0;
3306 }
3307 label_end_of_loop:
3308 return CODING_CATEGORY_MASK_CCL;
3309 }
3310
3311 \f
3312 /*** 6. End-of-line handlers ***/
3313
3314 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3315
3316 static void
3317 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3318 struct coding_system *coding;
3319 unsigned char *source, *destination;
3320 int src_bytes, dst_bytes;
3321 {
3322 unsigned char *src = source;
3323 unsigned char *dst = destination;
3324 unsigned char *src_end = src + src_bytes;
3325 unsigned char *dst_end = dst + dst_bytes;
3326 Lisp_Object translation_table;
3327 /* SRC_BASE remembers the start position in source in each loop.
3328 The loop will be exited when there's not enough source code
3329 (within macro ONE_MORE_BYTE), or when there's not enough
3330 destination area to produce a character (within macro
3331 EMIT_CHAR). */
3332 unsigned char *src_base;
3333 int c;
3334
3335 translation_table = Qnil;
3336 switch (coding->eol_type)
3337 {
3338 case CODING_EOL_CRLF:
3339 while (1)
3340 {
3341 src_base = src;
3342 ONE_MORE_BYTE (c);
3343 if (c == '\r')
3344 {
3345 ONE_MORE_BYTE (c);
3346 if (c != '\n')
3347 {
3348 src--;
3349 c = '\r';
3350 }
3351 }
3352 else if (c == '\n'
3353 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3354 {
3355 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3356 goto label_end_of_loop;
3357 }
3358 EMIT_CHAR (c);
3359 }
3360 break;
3361
3362 case CODING_EOL_CR:
3363 while (1)
3364 {
3365 src_base = src;
3366 ONE_MORE_BYTE (c);
3367 if (c == '\n')
3368 {
3369 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3370 {
3371 coding->result = CODING_FINISH_INCONSISTENT_EOL;
3372 goto label_end_of_loop;
3373 }
3374 }
3375 else if (c == '\r')
3376 c = '\n';
3377 EMIT_CHAR (c);
3378 }
3379 break;
3380
3381 default: /* no need for EOL handling */
3382 while (1)
3383 {
3384 src_base = src;
3385 ONE_MORE_BYTE (c);
3386 EMIT_CHAR (c);
3387 }
3388 }
3389
3390 label_end_of_loop:
3391 coding->consumed = coding->consumed_char = src_base - source;
3392 coding->produced = dst - destination;
3393 return;
3394 }
3395
3396 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
3397 format of end-of-line according to `coding->eol_type'. It also
3398 convert multibyte form 8-bit characters to unibyte if
3399 CODING->src_multibyte is nonzero. If `coding->mode &
3400 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3401 also means end-of-line. */
3402
3403 static void
3404 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3405 struct coding_system *coding;
3406 const unsigned char *source;
3407 unsigned char *destination;
3408 int src_bytes, dst_bytes;
3409 {
3410 const unsigned char *src = source;
3411 unsigned char *dst = destination;
3412 const unsigned char *src_end = src + src_bytes;
3413 unsigned char *dst_end = dst + dst_bytes;
3414 Lisp_Object translation_table;
3415 /* SRC_BASE remembers the start position in source in each loop.
3416 The loop will be exited when there's not enough source text to
3417 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3418 there's not enough destination area to produce encoded codes
3419 (within macro EMIT_BYTES). */
3420 const unsigned char *src_base;
3421 unsigned char *tmp;
3422 int c;
3423 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3424
3425 translation_table = Qnil;
3426 if (coding->src_multibyte
3427 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3428 {
3429 src_end--;
3430 src_bytes--;
3431 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3432 }
3433
3434 if (coding->eol_type == CODING_EOL_CRLF)
3435 {
3436 while (src < src_end)
3437 {
3438 src_base = src;
3439 c = *src++;
3440 if (c >= 0x20)
3441 EMIT_ONE_BYTE (c);
3442 else if (c == '\n' || (c == '\r' && selective_display))
3443 EMIT_TWO_BYTES ('\r', '\n');
3444 else
3445 EMIT_ONE_BYTE (c);
3446 }
3447 src_base = src;
3448 label_end_of_loop:
3449 ;
3450 }
3451 else
3452 {
3453 if (!dst_bytes || src_bytes <= dst_bytes)
3454 {
3455 safe_bcopy (src, dst, src_bytes);
3456 src_base = src_end;
3457 dst += src_bytes;
3458 }
3459 else
3460 {
3461 if (coding->src_multibyte
3462 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3463 dst_bytes--;
3464 safe_bcopy (src, dst, dst_bytes);
3465 src_base = src + dst_bytes;
3466 dst = destination + dst_bytes;
3467 coding->result = CODING_FINISH_INSUFFICIENT_DST;
3468 }
3469 if (coding->eol_type == CODING_EOL_CR)
3470 {
3471 for (tmp = destination; tmp < dst; tmp++)
3472 if (*tmp == '\n') *tmp = '\r';
3473 }
3474 else if (selective_display)
3475 {
3476 for (tmp = destination; tmp < dst; tmp++)
3477 if (*tmp == '\r') *tmp = '\n';
3478 }
3479 }
3480 if (coding->src_multibyte)
3481 dst = destination + str_as_unibyte (destination, dst - destination);
3482
3483 coding->consumed = src_base - source;
3484 coding->produced = dst - destination;
3485 coding->produced_char = coding->produced;
3486 }
3487
3488 \f
3489 /*** 7. C library functions ***/
3490
3491 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3492 has a property `coding-system'. The value of this property is a
3493 vector of length 5 (called the coding-vector). Among elements of
3494 this vector, the first (element[0]) and the fifth (element[4])
3495 carry important information for decoding/encoding. Before
3496 decoding/encoding, this information should be set in fields of a
3497 structure of type `coding_system'.
3498
3499 The value of the property `coding-system' can be a symbol of another
3500 subsidiary coding-system. In that case, Emacs gets coding-vector
3501 from that symbol.
3502
3503 `element[0]' contains information to be set in `coding->type'. The
3504 value and its meaning is as follows:
3505
3506 0 -- coding_type_emacs_mule
3507 1 -- coding_type_sjis
3508 2 -- coding_type_iso2022
3509 3 -- coding_type_big5
3510 4 -- coding_type_ccl encoder/decoder written in CCL
3511 nil -- coding_type_no_conversion
3512 t -- coding_type_undecided (automatic conversion on decoding,
3513 no-conversion on encoding)
3514
3515 `element[4]' contains information to be set in `coding->flags' and
3516 `coding->spec'. The meaning varies by `coding->type'.
3517
3518 If `coding->type' is `coding_type_iso2022', element[4] is a vector
3519 of length 32 (of which the first 17 sub-elements are used now).
3520 Meanings of these sub-elements are:
3521
3522 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3523 If the value is an integer of valid charset, the charset is
3524 assumed to be designated to graphic register N initially.
3525
3526 If the value is minus, it is a minus value of charset which
3527 reserves graphic register N, which means that the charset is
3528 not designated initially but should be designated to graphic
3529 register N just before encoding a character in that charset.
3530
3531 If the value is nil, graphic register N is never used on
3532 encoding.
3533
3534 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3535 Each value takes t or nil. See the section ISO2022 of
3536 `coding.h' for more information.
3537
3538 If `coding->type' is `coding_type_big5', element[4] is t to denote
3539 BIG5-ETen or nil to denote BIG5-HKU.
3540
3541 If `coding->type' takes the other value, element[4] is ignored.
3542
3543 Emacs Lisp's coding systems also carry information about format of
3544 end-of-line in a value of property `eol-type'. If the value is
3545 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3546 means CODING_EOL_CR. If it is not integer, it should be a vector
3547 of subsidiary coding systems of which property `eol-type' has one
3548 of the above values.
3549
3550 */
3551
3552 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3553 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
3554 is setup so that no conversion is necessary and return -1, else
3555 return 0.

     CODING_SYSTEM_SYMBOL above is the argument named `coding_system'
     in the definition below.  CODING is zero-cleared before anything
     else is done, so whatever it held before the call is discarded.
     The coding system is taken as invalid when it is nil, when its
     `coding-system' property is not a vector of length 5 whose
     element[3] is a cons, when its `coding-category' property does
     not lead to an integer index, or when the type-specific data in
     element[4] is malformed. */
3556
3557 int
3558 setup_coding_system (coding_system, coding)
3559 Lisp_Object coding_system;
3560 struct coding_system *coding;
3561 {
3562 Lisp_Object coding_spec, coding_type, eol_type, plist;
3563 Lisp_Object val;
3564
3565 /* At first, zero clear all members. */
3566 bzero (coding, sizeof (struct coding_system));
3567
3568 /* Initialize some fields required for all kinds of coding systems. */
3569 coding->symbol = coding_system;
3570 coding->heading_ascii = -1;
3571 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3572 coding->composing = COMPOSITION_DISABLED;
3573 coding->cmp_data = NULL;
3574
3575 if (NILP (coding_system))
3576 goto label_invalid_coding_system;
3577
3578 coding_spec = Fget (coding_system, Qcoding_system);
3579
     /* The coding-vector must have exactly 5 elements, and its
        element[3] (used as the property list below) must be a cons. */
3580 if (!VECTORP (coding_spec)
3581 || XVECTOR (coding_spec)->size != 5
3582 || !CONSP (XVECTOR (coding_spec)->contents[3]))
3583 goto label_invalid_coding_system;
3584
     /* A vector-valued `eol-type' means the format is not decided yet
        and detection is required; 1 and 2 select CRLF and CR; anything
        else (including nil when eol conversion is inhibited) is
        treated as LF. */
3585 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3586 if (VECTORP (eol_type))
3587 {
3588 coding->eol_type = CODING_EOL_UNDECIDED;
3589 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3590 }
3591 else if (XFASTINT (eol_type) == 1)
3592 {
3593 coding->eol_type = CODING_EOL_CRLF;
3594 coding->common_flags
3595 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3596 }
3597 else if (XFASTINT (eol_type) == 2)
3598 {
3599 coding->eol_type = CODING_EOL_CR;
3600 coding->common_flags
3601 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3602 }
3603 else
3604 coding->eol_type = CODING_EOL_LF;
3605
3606 coding_type = XVECTOR (coding_spec)->contents[0];
3607 /* Try short cut. A symbol here is either t (undecided) or, for
        any other symbol, no-conversion; neither needs the property
        list or element[4], so return at once. */
3608 if (SYMBOLP (coding_type))
3609 {
3610 if (EQ (coding_type, Qt))
3611 {
3612 coding->type = coding_type_undecided;
3613 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3614 }
3615 else
3616 coding->type = coding_type_no_conversion;
3617 /* Initialize this member. Any thing other than
3618 CODING_CATEGORY_IDX_UTF_16_BE and
3619 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3620 special treatment in detect_eol. */
3621 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3622
3623 return 0;
3624 }
3625
3626 /* Get values of coding system properties:
3627 `post-read-conversion', `pre-write-conversion',
3628 `translation-table-for-decode', `translation-table-for-encode'. */
3629 plist = XVECTOR (coding_spec)->contents[3];
3630 /* Pre & post conversion functions should be disabled if
3631 inhibit_pre_post_conversion is nonzero. This is the case that a code
3632 conversion function is called while those functions are running. */
3633 if (! inhibit_pre_post_conversion)
3634 {
3635 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3636 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3637 }
     /* A translation table property may be given as a symbol; in that
        case the table is looked up in the symbol's property of the
        same name. Anything that is not a char-table ends up as nil. */
3638 val = Fplist_get (plist, Qtranslation_table_for_decode);
3639 if (SYMBOLP (val))
3640 val = Fget (val, Qtranslation_table_for_decode);
3641 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3642 val = Fplist_get (plist, Qtranslation_table_for_encode);
3643 if (SYMBOLP (val))
3644 val = Fget (val, Qtranslation_table_for_encode);
3645 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
     /* The `coding-category' property is mandatory and must map to an
        integer category index. */
3646 val = Fplist_get (plist, Qcoding_category);
3647 if (!NILP (val))
3648 {
3649 val = Fget (val, Qcoding_category_index);
3650 if (INTEGERP (val))
3651 coding->category_idx = XINT (val);
3652 else
3653 goto label_invalid_coding_system;
3654 }
3655 else
3656 goto label_invalid_coding_system;
3657
3658 /* If the coding system has non-nil `composition' property, enable
3659 composition handling. */
3660 val = Fplist_get (plist, Qcomposition);
3661 if (!NILP (val))
3662 coding->composing = COMPOSITION_NO;
3663
     /* Dispatch on element[0] of the coding-vector; the numbers are
        the ones listed in the comment at the head of this section,
        plus 5 for raw-text. */
3664 switch (XFASTINT (coding_type))
3665 {
3666 case 0:
3667 coding->type = coding_type_emacs_mule;
3668 coding->common_flags
3669 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3670 if (!NILP (coding->post_read_conversion))
3671 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3672 if (!NILP (coding->pre_write_conversion))
3673 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3674 break;
3675
3676 case 1:
3677 coding->type = coding_type_sjis;
3678 coding->common_flags
3679 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3680 break;
3681
3682 case 2:
3683 coding->type = coding_type_iso2022;
3684 coding->common_flags
3685 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3686 {
     /* Note: this VAL shadows the function-level VAL for the rest of
        this block. REG_BITS collects one bit per graphic register
        that may be used by any charset. */
3687 Lisp_Object val, temp;
3688 Lisp_Object *flags;
3689 int i, charset, reg_bits = 0;
3690
3691 val = XVECTOR (coding_spec)->contents[4];
3692
3693 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3694 goto label_invalid_coding_system;
3695
     /* Sub-elements 4 through 16 are booleans, each mapped to one
        CODING_FLAG_ISO_* bit. */
3696 flags = XVECTOR (val)->contents;
3697 coding->flags
3698 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3699 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3700 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3701 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3702 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3703 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3704 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3705 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3706 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3707 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3708 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3709 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3710 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3711 );
3712
3713 /* Invoke graphic register 0 to plane 0. */
3714 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3715 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3716 CODING_SPEC_ISO_INVOCATION (coding, 1)
3717 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3718 /* Not single shifting at first. */
3719 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3720 /* Beginning of buffer should also be regarded as bol. */
3721 CODING_SPEC_ISO_BOL (coding) = 1;
3722
     /* Start with 255 as the revision number of every charset, then
        store the number from `Vcharset_revision_alist' for each
        charset listed there with an integer I such that I >= 0 and
        I + '@' < 128. */
3723 for (charset = 0; charset <= MAX_CHARSET; charset++)
3724 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3725 val = Vcharset_revision_alist;
3726 while (CONSP (val))
3727 {
3728 charset = get_charset_id (Fcar_safe (XCAR (val)));
3729 if (charset >= 0
3730 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3731 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3732 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3733 val = XCDR (val);
3734 }
3735
3736 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3737 FLAGS[REG] can be one of below:
3738 integer CHARSET: CHARSET occupies register I,
3739 t: designate nothing to REG initially, but can be used
3740 by any charsets,
3741 list of integer, nil, or t: designate the first
3742 element (if integer) to REG initially, the remaining
3743 elements (if integer) is designated to REG on request,
3744 if an element is t, REG can be used by any charsets,
3745 nil: REG is never used. */
3746 for (charset = 0; charset <= MAX_CHARSET; charset++)
3747 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3748 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3749 for (i = 0; i < 4; i++)
3750 {
     /* A charset may be given as a valid charset integer or as
        anything get_charset_id resolves to a non-negative id. */
3751 if ((INTEGERP (flags[i])
3752 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3753 || (charset = get_charset_id (flags[i])) >= 0)
3754 {
3755 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3756 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3757 }
3758 else if (EQ (flags[i], Qt))
3759 {
3760 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3761 reg_bits |= 1 << i;
3762 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3763 }
3764 else if (CONSP (flags[i]))
3765 {
3766 Lisp_Object tail;
3767 tail = flags[i];
3768
3769 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
     /* The first list element decides the initial designation. */
3770 if ((INTEGERP (XCAR (tail))
3771 && (charset = XINT (XCAR (tail)),
3772 CHARSET_VALID_P (charset)))
3773 || (charset = get_charset_id (XCAR (tail))) >= 0)
3774 {
3775 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3776 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3777 }
3778 else
3779 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
     /* The remaining elements only request register I. */
3780 tail = XCDR (tail);
3781 while (CONSP (tail))
3782 {
3783 if ((INTEGERP (XCAR (tail))
3784 && (charset = XINT (XCAR (tail)),
3785 CHARSET_VALID_P (charset)))
3786 || (charset = get_charset_id (XCAR (tail))) >= 0)
3787 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3788 = i;
3789 else if (EQ (XCAR (tail), Qt))
3790 reg_bits |= 1 << i;
3791 tail = XCDR (tail);
3792 }
3793 }
3794 else
3795 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3796
     /* The current designation starts out as the initial one. */
3797 CODING_SPEC_ISO_DESIGNATION (coding, i)
3798 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3799 }
3800
3801 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3802 {
3803 /* REG 1 can be used only by locking shift in 7-bit env. */
3804 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3805 reg_bits &= ~2;
3806 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3807 /* Without any shifting, only REG 0 and 1 can be used. */
3808 reg_bits &= 3;
3809 }
3810
     /* Give every defined charset that has no requested designation
        yet one of the registers recorded in REG_BITS. */
3811 if (reg_bits)
3812 for (charset = 0; charset <= MAX_CHARSET; charset++)
3813 {
3814 if (CHARSET_DEFINED_P (charset)
3815 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3816 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3817 {
3818 /* There exist some default graphic registers to be
3819 used by CHARSET. */
3820
3821 /* We had better avoid designating a charset of
3822 CHARS96 to REG 0 as far as possible. */
3823 if (CHARSET_CHARS (charset) == 96)
3824 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3825 = (reg_bits & 2
3826 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3827 else
3828 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3829 = (reg_bits & 1
3830 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3831 }
3832 }
3833 }
3834 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3835 coding->spec.iso2022.last_invalid_designation_register = -1;
3836 break;
3837
3838 case 3:
3839 coding->type = coding_type_big5;
3840 coding->common_flags
3841 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
     /* element[4]: nil selects BIG5-HKU, anything else BIG5-ETen. */
3842 coding->flags
3843 = (NILP (XVECTOR (coding_spec)->contents[4])
3844 ? CODING_FLAG_BIG5_HKU
3845 : CODING_FLAG_BIG5_ETEN);
3846 break;
3847
3848 case 4:
3849 coding->type = coding_type_ccl;
3850 coding->common_flags
3851 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3852 {
     /* element[4] must be a cons whose car and cdr are both accepted
        by setup_ccl_program, for the decoder and the encoder
        respectively. */
3853 val = XVECTOR (coding_spec)->contents[4];
3854 if (! CONSP (val)
3855 || setup_ccl_program (&(coding->spec.ccl.decoder),
3856 XCAR (val)) < 0
3857 || setup_ccl_program (&(coding->spec.ccl.encoder),
3858 XCDR (val)) < 0)
3859 goto label_invalid_coding_system;
3860
     /* Build the 256-entry table of valid codes from the
        `valid-codes' property: each element is either a single code
        or a cons (START . END) giving an inclusive range. Entries
        outside 0..255 are ignored. */
3861 bzero (coding->spec.ccl.valid_codes, 256);
3862 val = Fplist_get (plist, Qvalid_codes);
3863 if (CONSP (val))
3864 {
3865 Lisp_Object this;
3866
3867 for (; CONSP (val); val = XCDR (val))
3868 {
3869 this = XCAR (val);
3870 if (INTEGERP (this)
3871 && XINT (this) >= 0 && XINT (this) < 256)
3872 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3873 else if (CONSP (this)
3874 && INTEGERP (XCAR (this))
3875 && INTEGERP (XCDR (this)))
3876 {
3877 int start = XINT (XCAR (this));
3878 int end = XINT (XCDR (this));
3879
3880 if (start >= 0 && start <= end && end < 256)
3881 while (start <= end)
3882 coding->spec.ccl.valid_codes[start++] = 1;
3883 }
3884 }
3885 }
3886 }
3887 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3888 coding->spec.ccl.cr_carryover = 0;
3889 coding->spec.ccl.eight_bit_carryover[0] = 0;
3890 break;
3891
3892 case 5:
3893 coding->type = coding_type_raw_text;
3894 break;
3895
3896 default:
3897 goto label_invalid_coding_system;
3898 }
3899 return 0;
3900
     /* Fall back to a setup that performs no conversion at all. */
3901 label_invalid_coding_system:
3902 coding->type = coding_type_no_conversion;
3903 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3904 coding->common_flags = 0;
3905 coding->eol_type = CODING_EOL_LF;
3906 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3907 return -1;
3908 }
3909
3910 /* Free memory blocks allocated for storing composition information. */
3911
3912 void
3913 coding_free_composition_data (coding)
3914 struct coding_system *coding;
3915 {
3916 struct composition_data *cmp_data = coding->cmp_data, *next;
3917
3918 if (!cmp_data)
3919 return;
3920 /* Memory blocks are chained. At first, rewind to the first, then,
3921 free blocks one by one. */
3922 while (cmp_data->prev)
3923 cmp_data = cmp_data->prev;
3924 while (cmp_data)
3925 {
3926 next = cmp_data->next;
3927 xfree (cmp_data);
3928 cmp_data = next;
3929 }
3930 coding->cmp_data = NULL;
3931 }
3932
3933 /* Set `char_offset' member of all memory blocks pointed by
3934 coding->cmp_data to POS. */
3935
3936 void
3937 coding_adjust_composition_offset (coding, pos)
3938 struct coding_system *coding;
3939 int pos;
3940 {
3941 struct composition_data *cmp_data;
3942
3943 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3944 cmp_data->char_offset = pos;
3945 }
3946
3947 /* Setup raw-text or one of its subsidiaries in the structure
3948 coding_system CODING according to the already setup value eol_type
3949 in CODING. CODING should be setup for some coding system in
3950 advance. */
3951
3952 void
3953 setup_raw_text_coding_system (coding)
3954 struct coding_system *coding;
3955 {
3956 if (coding->type != coding_type_raw_text)
3957 {
3958 coding->symbol = Qraw_text;
3959 coding->type = coding_type_raw_text;
3960 if (coding->eol_type != CODING_EOL_UNDECIDED)
3961 {
3962 Lisp_Object subsidiaries;
3963 subsidiaries = Fget (Qraw_text, Qeol_type);
3964
3965 if (VECTORP (subsidiaries)
3966 && XVECTOR (subsidiaries)->size == 3)
3967 coding->symbol
3968 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3969 }
3970 setup_coding_system (coding->symbol, coding);
3971 }
3972 return;
3973 }
3974
/* Emacs has a mechanism to automatically detect the coding system of
   a text if it is one of Emacs' internal format, ISO2022, SJIS, BIG5,
   UTF-8, and UTF-16.  But, it's impossible to distinguish some coding
   systems accurately because they use the same range of codes.  So,
   at first, coding systems are grouped into the following categories:
3980
3981 o coding-category-emacs-mule
3982
3983 The category for a coding system which has the same code range
3984 as Emacs' internal format. Assigned the coding-system (Lisp
3985 symbol) `emacs-mule' by default.
3986
3987 o coding-category-sjis
3988
3989 The category for a coding system which has the same code range
3990 as SJIS. Assigned the coding-system (Lisp
3991 symbol) `japanese-shift-jis' by default.
3992
3993 o coding-category-iso-7
3994
3995 The category for a coding system which has the same code range
3996 as ISO2022 of 7-bit environment. This doesn't use any locking
3997 shift and single shift functions. This can encode/decode all
3998 charsets. Assigned the coding-system (Lisp symbol)
3999 `iso-2022-7bit' by default.
4000
4001 o coding-category-iso-7-tight
4002
4003 Same as coding-category-iso-7 except that this can
4004 encode/decode only the specified charsets.
4005
4006 o coding-category-iso-8-1
4007
4008 The category for a coding system which has the same code range
4009 as ISO2022 of 8-bit environment and graphic plane 1 used only
4010 for DIMENSION1 charset. This doesn't use any locking shift
4011 and single shift functions. Assigned the coding-system (Lisp
4012 symbol) `iso-latin-1' by default.
4013
4014 o coding-category-iso-8-2
4015
4016 The category for a coding system which has the same code range
4017 as ISO2022 of 8-bit environment and graphic plane 1 used only
4018 for DIMENSION2 charset. This doesn't use any locking shift
4019 and single shift functions. Assigned the coding-system (Lisp
4020 symbol) `japanese-iso-8bit' by default.
4021
4022 o coding-category-iso-7-else
4023
4024 The category for a coding system which has the same code range
4025 as ISO2022 of 7-bit environment but uses locking shift or
4026 single shift functions. Assigned the coding-system (Lisp
4027 symbol) `iso-2022-7bit-lock' by default.
4028
4029 o coding-category-iso-8-else
4030
4031 The category for a coding system which has the same code range
4032 as ISO2022 of 8-bit environment but uses locking shift or
4033 single shift functions. Assigned the coding-system (Lisp
4034 symbol) `iso-2022-8bit-ss2' by default.
4035
4036 o coding-category-big5
4037
4038 The category for a coding system which has the same code range
4039 as BIG5. Assigned the coding-system (Lisp symbol)
4040 `cn-big5' by default.
4041
4042 o coding-category-utf-8
4043
4044 The category for a coding system which has the same code range
4045 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
4046 symbol) `utf-8' by default.
4047
4048 o coding-category-utf-16-be
4049
4050 The category for a coding system in which a text has an
4051 Unicode signature (cf. Unicode Standard) in the order of BIG
4052 endian at the head. Assigned the coding-system (Lisp symbol)
4053 `utf-16-be' by default.
4054
4055 o coding-category-utf-16-le
4056
4057 The category for a coding system in which a text has an
4058 Unicode signature (cf. Unicode Standard) in the order of
4059 LITTLE endian at the head. Assigned the coding-system (Lisp
4060 symbol) `utf-16-le' by default.
4061
4062 o coding-category-ccl
4063
4064 The category for a coding system of which encoder/decoder is
4065 written in CCL programs. The default value is nil, i.e., no
4066 coding system is assigned.
4067
4068 o coding-category-binary
4069
4070 The category for a coding system not categorized in any of the
4071 above. Assigned the coding-system (Lisp symbol)
4072 `no-conversion' by default.
4073
4074 Each of them is a Lisp symbol and the value is an actual
4075 `coding-system' (this is also a Lisp symbol) assigned by a user.
4076 What Emacs does actually is to detect a category of coding system.
4077 Then, it uses a `coding-system' assigned to it. If Emacs can't
4078 decide a single possible category, it selects a category of the
4079 highest priority. Priorities of categories are also specified by a
4080 user in a Lisp variable `coding-category-list'.
4081
4082 */
4083
4084 static
4085 int ascii_skip_code[256];
4086
4087 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4088 If it detects possible coding systems, return an integer in which
4089 appropriate flag bits are set. Flag bits are defined by macros
4090 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
4091 it should point the table `coding_priorities'. In that case, only
4092 the flag bit for a coding system of the highest priority is set in
4093 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the
4094 range 0x80..0x9F are in multibyte form.
4095
4096 How many ASCII characters are at the head is returned as *SKIP. */
4097
4098 static int
4099 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4100 unsigned char *source;
4101 int src_bytes, *priorities, *skip;
4102 int multibytep;
4103 {
4104 register unsigned char c;
4105 unsigned char *src = source, *src_end = source + src_bytes;
4106 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4107 int i;
4108
4109 /* At first, skip all ASCII characters and control characters except
4110 for three ISO2022 specific control characters. */
4111 ascii_skip_code[ISO_CODE_SO] = 0;
4112 ascii_skip_code[ISO_CODE_SI] = 0;
4113 ascii_skip_code[ISO_CODE_ESC] = 0;
4114
4115 label_loop_detect_coding:
4116 while (src < src_end && ascii_skip_code[*src]) src++;
4117 *skip = src - source;
4118
4119 if (src >= src_end)
4120 /* We found nothing other than ASCII. There's nothing to do. */
4121 return 0;
4122
4123 c = *src;
4124 /* The text seems to be encoded in some multilingual coding system.
4125 Now, try to find in which coding system the text is encoded. */
4126 if (c < 0x80)
4127 {
4128 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4129 /* C is an ISO2022 specific control code of C0. */
4130 mask = detect_coding_iso2022 (src, src_end, multibytep);
4131 if (mask == 0)
4132 {
4133 /* No valid ISO2022 code follows C. Try again. */
4134 src++;
4135 if (c == ISO_CODE_ESC)
4136 ascii_skip_code[ISO_CODE_ESC] = 1;
4137 else
4138 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4139 goto label_loop_detect_coding;
4140 }
4141 if (priorities)
4142 {
4143 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4144 {
4145 if (mask & priorities[i])
4146 return priorities[i];
4147 }
4148 return CODING_CATEGORY_MASK_RAW_TEXT;
4149 }
4150 }
4151 else
4152 {
4153 int try;
4154
4155 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4156 c = src[1] - 0x20;
4157
4158 if (c < 0xA0)
4159 {
4160 /* C is the first byte of SJIS character code,
4161 or a leading-code of Emacs' internal format (emacs-mule),
4162 or the first byte of UTF-16. */
4163 try = (CODING_CATEGORY_MASK_SJIS
4164 | CODING_CATEGORY_MASK_EMACS_MULE
4165 | CODING_CATEGORY_MASK_UTF_16_BE
4166 | CODING_CATEGORY_MASK_UTF_16_LE);
4167
4168 /* Or, if C is a special latin extra code,
4169 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4170 or is an ISO2022 control-sequence-introducer (CSI),
4171 we should also consider the possibility of ISO2022 codings. */
4172 if ((VECTORP (Vlatin_extra_code_table)
4173 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4174 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4175 || (c == ISO_CODE_CSI
4176 && (src < src_end
4177 && (*src == ']'
4178 || ((*src == '0' || *src == '1' || *src == '2')
4179 && src + 1 < src_end
4180 && src[1] == ']')))))
4181 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4182 | CODING_CATEGORY_MASK_ISO_8BIT);
4183 }
4184 else
4185 /* C is a character of ISO2022 in graphic plane right,
4186 or a SJIS's 1-byte character code (i.e. JISX0201),
4187 or the first byte of BIG5's 2-byte code,
4188 or the first byte of UTF-8/16. */
4189 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4190 | CODING_CATEGORY_MASK_ISO_8BIT
4191 | CODING_CATEGORY_MASK_SJIS
4192 | CODING_CATEGORY_MASK_BIG5
4193 | CODING_CATEGORY_MASK_UTF_8
4194 | CODING_CATEGORY_MASK_UTF_16_BE
4195 | CODING_CATEGORY_MASK_UTF_16_LE);
4196
4197 /* Or, we may have to consider the possibility of CCL. */
4198 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4199 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4200 ->spec.ccl.valid_codes)[c])
4201 try |= CODING_CATEGORY_MASK_CCL;
4202
4203 mask = 0;
4204 utf16_examined_p = iso2022_examined_p = 0;
4205 if (priorities)
4206 {
4207 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4208 {
4209 if (!iso2022_examined_p
4210 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4211 {
4212 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4213 iso2022_examined_p = 1;
4214 }
4215 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4216 mask |= detect_coding_sjis (src, src_end, multibytep);
4217 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4218 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4219 else if (!utf16_examined_p
4220 && (priorities[i] & try &
4221 CODING_CATEGORY_MASK_UTF_16_BE_LE))
4222 {
4223 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4224 utf16_examined_p = 1;
4225 }
4226 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4227 mask |= detect_coding_big5 (src, src_end, multibytep);
4228 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4229 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4230 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4231 mask |= detect_coding_ccl (src, src_end, multibytep);
4232 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4233 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4234 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4235 mask |= CODING_CATEGORY_MASK_BINARY;
4236 if (mask & priorities[i])
4237 return priorities[i];
4238 }
4239 return CODING_CATEGORY_MASK_RAW_TEXT;
4240 }
4241 if (try & CODING_CATEGORY_MASK_ISO)
4242 mask |= detect_coding_iso2022 (src, src_end, multibytep);
4243 if (try & CODING_CATEGORY_MASK_SJIS)
4244 mask |= detect_coding_sjis (src, src_end, multibytep);
4245 if (try & CODING_CATEGORY_MASK_BIG5)
4246 mask |= detect_coding_big5 (src, src_end, multibytep);
4247 if (try & CODING_CATEGORY_MASK_UTF_8)
4248 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4249 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4250 mask |= detect_coding_utf_16 (src, src_end, multibytep);
4251 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4252 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4253 if (try & CODING_CATEGORY_MASK_CCL)
4254 mask |= detect_coding_ccl (src, src_end, multibytep);
4255 }
4256 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4257 }
4258
4259 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4260 The information of the detected coding system is set in CODING. */
4261
4262 void
4263 detect_coding (coding, src, src_bytes)
4264 struct coding_system *coding;
4265 const unsigned char *src;
4266 int src_bytes;
4267 {
4268 unsigned int idx;
4269 int skip, mask;
4270 Lisp_Object val;
4271
4272 val = Vcoding_category_list;
4273 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4274 coding->src_multibyte);
4275 coding->heading_ascii = skip;
4276
4277 if (!mask) return;
4278
4279 /* We found a single coding system of the highest priority in MASK. */
4280 idx = 0;
4281 while (mask && ! (mask & 1)) mask >>= 1, idx++;
4282 if (! mask)
4283 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4284
4285 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4286
4287 if (coding->eol_type != CODING_EOL_UNDECIDED)
4288 {
4289 Lisp_Object tmp;
4290
4291 tmp = Fget (val, Qeol_type);
4292 if (VECTORP (tmp))
4293 val = XVECTOR (tmp)->contents[coding->eol_type];
4294 }
4295
4296 /* Setup this new coding system while preserving some slots. */
4297 {
4298 int src_multibyte = coding->src_multibyte;
4299 int dst_multibyte = coding->dst_multibyte;
4300
4301 setup_coding_system (val, coding);
4302 coding->src_multibyte = src_multibyte;
4303 coding->dst_multibyte = dst_multibyte;
4304 coding->heading_ascii = skip;
4305 }
4306 }
4307
4308 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4309 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4310 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4311
4312 How many non-eol characters are at the head is returned as *SKIP. */
4313
4314 #define MAX_EOL_CHECK_COUNT 3
4315
4316 static int
4317 detect_eol_type (source, src_bytes, skip)
4318 unsigned char *source;
4319 int src_bytes, *skip;
4320 {
4321 unsigned char *src = source, *src_end = src + src_bytes;
4322 unsigned char c;
4323 int total = 0; /* How many end-of-lines are found so far. */
4324 int eol_type = CODING_EOL_UNDECIDED;
4325 int this_eol_type;
4326
4327 *skip = 0;
4328
4329 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4330 {
4331 c = *src++;
4332 if (c == '\n' || c == '\r')
4333 {
4334 if (*skip == 0)
4335 *skip = src - 1 - source;
4336 total++;
4337 if (c == '\n')
4338 this_eol_type = CODING_EOL_LF;
4339 else if (src >= src_end || *src != '\n')
4340 this_eol_type = CODING_EOL_CR;
4341 else
4342 this_eol_type = CODING_EOL_CRLF, src++;
4343
4344 if (eol_type == CODING_EOL_UNDECIDED)
4345 /* This is the first end-of-line. */
4346 eol_type = this_eol_type;
4347 else if (eol_type != this_eol_type)
4348 {
4349 /* The found type is different from what found before. */
4350 eol_type = CODING_EOL_INCONSISTENT;
4351 break;
4352 }
4353 }
4354 }
4355
4356 if (*skip == 0)
4357 *skip = src_end - source;
4358 return eol_type;
4359 }
4360
4361 /* Like detect_eol_type, but detect EOL type in 2-octet
4362 big-endian/little-endian format for coding systems utf-16-be and
4363 utf-16-le. */
4364
4365 static int
4366 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4367 unsigned char *source;
4368 int src_bytes, *skip, big_endian_p;
4369 {
4370 unsigned char *src = source, *src_end = src + src_bytes;
4371 unsigned int c1, c2;
4372 int total = 0; /* How many end-of-lines are found so far. */
4373 int eol_type = CODING_EOL_UNDECIDED;
4374 int this_eol_type;
4375 int msb, lsb;
4376
4377 if (big_endian_p)
4378 msb = 0, lsb = 1;
4379 else
4380 msb = 1, lsb = 0;
4381
4382 *skip = 0;
4383
4384 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4385 {
4386 c1 = (src[msb] << 8) | (src[lsb]);
4387 src += 2;
4388
4389 if (c1 == '\n' || c1 == '\r')
4390 {
4391 if (*skip == 0)
4392 *skip = src - 2 - source;
4393 total++;
4394 if (c1 == '\n')
4395 {
4396 this_eol_type = CODING_EOL_LF;
4397 }
4398 else
4399 {
4400 if ((src + 1) >= src_end)
4401 {
4402 this_eol_type = CODING_EOL_CR;
4403 }
4404 else
4405 {
4406 c2 = (src[msb] << 8) | (src[lsb]);
4407 if (c2 == '\n')
4408 this_eol_type = CODING_EOL_CRLF, src += 2;
4409 else
4410 this_eol_type = CODING_EOL_CR;
4411 }
4412 }
4413
4414 if (eol_type == CODING_EOL_UNDECIDED)
4415 /* This is the first end-of-line. */
4416 eol_type = this_eol_type;
4417 else if (eol_type != this_eol_type)
4418 {
4419 /* The found type is different from what found before. */
4420 eol_type = CODING_EOL_INCONSISTENT;
4421 break;
4422 }
4423 }
4424 }
4425
4426 if (*skip == 0)
4427 *skip = src_end - source;
4428 return eol_type;
4429 }
4430
4431 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4432 is encoded. If it detects an appropriate format of end-of-line, it
4433 sets the information in *CODING. */
4434
4435 void
4436 detect_eol (coding, src, src_bytes)
4437 struct coding_system *coding;
4438 const unsigned char *src;
4439 int src_bytes;
4440 {
4441 Lisp_Object val;
4442 int skip;
4443 int eol_type;
4444
4445 switch (coding->category_idx)
4446 {
4447 case CODING_CATEGORY_IDX_UTF_16_BE:
4448 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4449 break;
4450 case CODING_CATEGORY_IDX_UTF_16_LE:
4451 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4452 break;
4453 default:
4454 eol_type = detect_eol_type (src, src_bytes, &skip);
4455 break;
4456 }
4457
4458 if (coding->heading_ascii > skip)
4459 coding->heading_ascii = skip;
4460 else
4461 skip = coding->heading_ascii;
4462
4463 if (eol_type == CODING_EOL_UNDECIDED)
4464 return;
4465 if (eol_type == CODING_EOL_INCONSISTENT)
4466 {
4467 #if 0
4468 /* This code is suppressed until we find a better way to
4469 distinguish raw text file and binary file. */
4470
4471 /* If we have already detected that the coding is raw-text, the
4472 coding should actually be no-conversion. */
4473 if (coding->type == coding_type_raw_text)
4474 {
4475 setup_coding_system (Qno_conversion, coding);
4476 return;
4477 }
4478 /* Else, let's decode only text code anyway. */
4479 #endif /* 0 */
4480 eol_type = CODING_EOL_LF;
4481 }
4482
4483 val = Fget (coding->symbol, Qeol_type);
4484 if (VECTORP (val) && XVECTOR (val)->size == 3)
4485 {
4486 int src_multibyte = coding->src_multibyte;
4487 int dst_multibyte = coding->dst_multibyte;
4488 struct composition_data *cmp_data = coding->cmp_data;
4489
4490 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4491 coding->src_multibyte = src_multibyte;
4492 coding->dst_multibyte = dst_multibyte;
4493 coding->heading_ascii = skip;
4494 coding->cmp_data = cmp_data;
4495 }
4496 }
4497
/* Extra room (in bytes) added to every conversion buffer size
   estimate below.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the buffer size estimate for decoding with CODING
   (a pointer to struct coding_system) multiplies the source length:
   3 for ISO2022, the magnification declared by the CCL decoder for
   CCL, and 2 otherwise.  The argument is parenthesized so that any
   pointer expression may be passed.  */
#define DECODING_BUFFER_MAG(coding)				\
  ((coding)->type == coding_type_iso2022			\
   ? 3								\
   : ((coding)->type == coding_type_ccl				\
      ? (coding)->spec.ccl.decoder.buf_magnification		\
      : 2))
4506
4507 /* Return maximum size (bytes) of a buffer enough for decoding
4508 SRC_BYTES of text encoded in CODING. */
4509
4510 int
4511 decoding_buffer_size (coding, src_bytes)
4512 struct coding_system *coding;
4513 int src_bytes;
4514 {
4515 return (src_bytes * DECODING_BUFFER_MAG (coding)
4516 + CONVERSION_BUFFER_EXTRA_ROOM);
4517 }
4518
4519 /* Return maximum size (bytes) of a buffer enough for encoding
4520 SRC_BYTES of text to CODING. */
4521
4522 int
4523 encoding_buffer_size (coding, src_bytes)
4524 struct coding_system *coding;
4525 int src_bytes;
4526 {
4527 int magnification;
4528
4529 if (coding->type == coding_type_ccl)
4530 {
4531 magnification = coding->spec.ccl.encoder.buf_magnification;
4532 if (coding->eol_type == CODING_EOL_CRLF)
4533 magnification *= 2;
4534 }
4535 else if (CODING_REQUIRE_ENCODING (coding))
4536 magnification = 3;
4537 else
4538 magnification = 1;
4539
4540 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4541 }
4542
/* Working buffer for code conversion.  It is set up by
   allocate_conversion_buffer, enlarged by extend_conversion_buffer,
   and released by free_conversion_buffer.  */
struct conversion_buffer
{
  /* Number of bytes allocated at DATA.  */
  int size;
  /* 1 if DATA was allocated by alloca, 0 if it was allocated by
     xmalloc and thus has to be freed.  */
  int on_stack;
  /* The memory area itself.  */
  unsigned char *data;
};
4550
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   The memory is taken from the stack by alloca if LEN is less than
   MAX_ALLOCA, else from the heap by xmalloc; BUF.on_stack records
   which.  This has to be a macro because memory obtained by alloca
   belongs to the calling function.

   BUF and LEN are parenthesized so that arbitrary expressions may be
   passed, but note that both are evaluated more than once.  */
#define allocate_conversion_buffer(buf, len)		\
  do {							\
    if ((len) < MAX_ALLOCA)				\
      {							\
	(buf).data = (unsigned char *) alloca (len);	\
	(buf).on_stack = 1;				\
      }							\
    else						\
      {							\
	(buf).data = (unsigned char *) xmalloc (len);	\
	(buf).on_stack = 0;				\
      }							\
    (buf).size = (len);					\
  } while (0)
4566
4567 /* Double the allocated memory for *BUF. */
4568 static void
4569 extend_conversion_buffer (buf)
4570 struct conversion_buffer *buf;
4571 {
4572 if (buf->on_stack)
4573 {
4574 unsigned char *save = buf->data;
4575 buf->data = (unsigned char *) xmalloc (buf->size * 2);
4576 bcopy (save, buf->data, buf->size);
4577 buf->on_stack = 0;
4578 }
4579 else
4580 {
4581 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4582 }
4583 buf->size *= 2;
4584 }
4585
4586 /* Free the allocated memory for BUF if it is not on stack. */
4587 static void
4588 free_conversion_buffer (buf)
4589 struct conversion_buffer *buf;
4590 {
4591 if (!buf->on_stack)
4592 xfree (buf->data);
4593 }
4594
4595 int
4596 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4597 struct coding_system *coding;
4598 unsigned char *source, *destination;
4599 int src_bytes, dst_bytes, encodep;
4600 {
4601 struct ccl_program *ccl
4602 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4603 unsigned char *dst = destination;
4604
4605 ccl->suppress_error = coding->suppress_error;
4606 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4607 if (encodep)
4608 {
4609 /* On encoding, EOL format is converted within ccl_driver. For
4610 that, setup proper information in the structure CCL. */
4611 ccl->eol_type = coding->eol_type;
4612 if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4613 ccl->eol_type = CODING_EOL_LF;
4614 ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4615 ccl->eight_bit_control = coding->dst_multibyte;
4616 }
4617 else
4618 ccl->eight_bit_control = 1;
4619 ccl->multibyte = coding->src_multibyte;
4620 if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4621 {
4622 /* Move carryover bytes to DESTINATION. */
4623 unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4624 while (*p)
4625 *dst++ = *p++;
4626 coding->spec.ccl.eight_bit_carryover[0] = 0;
4627 if (dst_bytes)
4628 dst_bytes -= dst - destination;
4629 }
4630
4631 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4632 &(coding->consumed))
4633 + dst - destination);
4634
4635 if (encodep)
4636 {
4637 coding->produced_char = coding->produced;
4638 coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4639 }
4640 else if (!ccl->eight_bit_control)
4641 {
4642 /* The produced bytes forms a valid multibyte sequence. */
4643 coding->produced_char
4644 = multibyte_chars_in_text (destination, coding->produced);
4645 coding->spec.ccl.eight_bit_carryover[0] = 0;
4646 }
4647 else
4648 {
4649 /* On decoding, the destination should always multibyte. But,
4650 CCL program might have been generated an invalid multibyte
4651 sequence. Here we make such a sequence valid as
4652 multibyte. */
4653 int bytes
4654 = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4655
4656 if ((coding->consumed < src_bytes
4657 || !ccl->last_block)
4658 && coding->produced >= 1
4659 && destination[coding->produced - 1] >= 0x80)
4660 {
4661 /* We should not convert the tailing 8-bit codes to
4662 multibyte form even if they doesn't form a valid
4663 multibyte sequence. They may form a valid sequence in
4664 the next call. */
4665 int carryover = 0;
4666
4667 if (destination[coding->produced - 1] < 0xA0)
4668 carryover = 1;
4669 else if (coding->produced >= 2)
4670 {
4671 if (destination[coding->produced - 2] >= 0x80)
4672 {
4673 if (destination[coding->produced - 2] < 0xA0)
4674 carryover = 2;
4675 else if (coding->produced >= 3
4676 && destination[coding->produced - 3] >= 0x80
4677 && destination[coding->produced - 3] < 0xA0)
4678 carryover = 3;
4679 }
4680 }
4681 if (carryover > 0)
4682 {
4683 BCOPY_SHORT (destination + coding->produced - carryover,
4684 coding->spec.ccl.eight_bit_carryover,
4685 carryover);
4686 coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4687 coding->produced -= carryover;
4688 }
4689 }
4690 coding->produced = str_as_multibyte (destination, bytes,
4691 coding->produced,
4692 &(coding->produced_char));
4693 }
4694
4695 switch (ccl->status)
4696 {
4697 case CCL_STAT_SUSPEND_BY_SRC:
4698 coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4699 break;
4700 case CCL_STAT_SUSPEND_BY_DST:
4701 coding->result = CODING_FINISH_INSUFFICIENT_DST;
4702 break;
4703 case CCL_STAT_QUIT:
4704 case CCL_STAT_INVALID_CMD:
4705 coding->result = CODING_FINISH_INTERRUPT;
4706 break;
4707 default:
4708 coding->result = CODING_FINISH_NORMAL;
4709 break;
4710 }
4711 return coding->result;
4712 }
4713
4714 /* Decode EOL format of the text at PTR of BYTES length destructively
4715 according to CODING->eol_type. This is called after the CCL
4716 program produced a decoded text at PTR. If we do CRLF->LF
4717 conversion, update CODING->produced and CODING->produced_char. */
4718
4719 static void
4720 decode_eol_post_ccl (coding, ptr, bytes)
4721 struct coding_system *coding;
4722 unsigned char *ptr;
4723 int bytes;
4724 {
4725 Lisp_Object val, saved_coding_symbol;
4726 unsigned char *pend = ptr + bytes;
4727 int dummy;
4728
4729 /* Remember the current coding system symbol. We set it back when
4730 an inconsistent EOL is found so that `last-coding-system-used' is
4731 set to the coding system that doesn't specify EOL conversion. */
4732 saved_coding_symbol = coding->symbol;
4733
4734 coding->spec.ccl.cr_carryover = 0;
4735 if (coding->eol_type == CODING_EOL_UNDECIDED)
4736 {
4737 /* Here, to avoid the call of setup_coding_system, we directly
4738 call detect_eol_type. */
4739 coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4740 if (coding->eol_type == CODING_EOL_INCONSISTENT)
4741 coding->eol_type = CODING_EOL_LF;
4742 if (coding->eol_type != CODING_EOL_UNDECIDED)
4743 {
4744 val = Fget (coding->symbol, Qeol_type);
4745 if (VECTORP (val) && XVECTOR (val)->size == 3)
4746 coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4747 }
4748 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4749 }
4750
4751 if (coding->eol_type == CODING_EOL_LF
4752 || coding->eol_type == CODING_EOL_UNDECIDED)
4753 {
4754 /* We have nothing to do. */
4755 ptr = pend;
4756 }
4757 else if (coding->eol_type == CODING_EOL_CRLF)
4758 {
4759 unsigned char *pstart = ptr, *p = ptr;
4760
4761 if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4762 && *(pend - 1) == '\r')
4763 {
4764 /* If the last character is CR, we can't handle it here
4765 because LF will be in the not-yet-decoded source text.
4766 Record that the CR is not yet processed. */
4767 coding->spec.ccl.cr_carryover = 1;
4768 coding->produced--;
4769 coding->produced_char--;
4770 pend--;
4771 }
4772 while (ptr < pend)
4773 {
4774 if (*ptr == '\r')
4775 {
4776 if (ptr + 1 < pend && *(ptr + 1) == '\n')
4777 {
4778 *p++ = '\n';
4779 ptr += 2;
4780 }
4781 else
4782 {
4783 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4784 goto undo_eol_conversion;
4785 *p++ = *ptr++;
4786 }
4787 }
4788 else if (*ptr == '\n'
4789 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4790 goto undo_eol_conversion;
4791 else
4792 *p++ = *ptr++;
4793 continue;
4794
4795 undo_eol_conversion:
4796 /* We have faced with inconsistent EOL format at PTR.
4797 Convert all LFs before PTR back to CRLFs. */
4798 for (p--, ptr--; p >= pstart; p--)
4799 {
4800 if (*p == '\n')
4801 *ptr-- = '\n', *ptr-- = '\r';
4802 else
4803 *ptr-- = *p;
4804 }
4805 /* If carryover is recorded, cancel it because we don't
4806 convert CRLF anymore. */
4807 if (coding->spec.ccl.cr_carryover)
4808 {
4809 coding->spec.ccl.cr_carryover = 0;
4810 coding->produced++;
4811 coding->produced_char++;
4812 pend++;
4813 }
4814 p = ptr = pend;
4815 coding->eol_type = CODING_EOL_LF;
4816 coding->symbol = saved_coding_symbol;
4817 }
4818 if (p < pend)
4819 {
4820 /* As each two-byte sequence CRLF was converted to LF, (PEND
4821 - P) is the number of deleted characters. */
4822 coding->produced -= pend - p;
4823 coding->produced_char -= pend - p;
4824 }
4825 }
4826 else /* i.e. coding->eol_type == CODING_EOL_CR */
4827 {
4828 unsigned char *p = ptr;
4829
4830 for (; ptr < pend; ptr++)
4831 {
4832 if (*ptr == '\r')
4833 *ptr = '\n';
4834 else if (*ptr == '\n'
4835 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4836 {
4837 for (; p < ptr; p++)
4838 {
4839 if (*p == '\n')
4840 *p = '\r';
4841 }
4842 ptr = pend;
4843 coding->eol_type = CODING_EOL_LF;
4844 coding->symbol = saved_coding_symbol;
4845 }
4846 }
4847 }
4848 }
4849
4850 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
4851 decoding, it may detect coding system and format of end-of-line if
4852 those are not yet decided. The source should be unibyte, the
4853 result is multibyte if CODING->dst_multibyte is nonzero, else
4854 unibyte. */
4855
4856 int
4857 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4858 struct coding_system *coding;
4859 const unsigned char *source;
4860 unsigned char *destination;
4861 int src_bytes, dst_bytes;
4862 {
4863 int extra = 0;
4864
4865 if (coding->type == coding_type_undecided)
4866 detect_coding (coding, source, src_bytes);
4867
4868 if (coding->eol_type == CODING_EOL_UNDECIDED
4869 && coding->type != coding_type_ccl)
4870 {
4871 detect_eol (coding, source, src_bytes);
4872 /* We had better recover the original eol format if we
4873 encounter an inconsistent eol format while decoding. */
4874 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4875 }
4876
4877 coding->produced = coding->produced_char = 0;
4878 coding->consumed = coding->consumed_char = 0;
4879 coding->errors = 0;
4880 coding->result = CODING_FINISH_NORMAL;
4881
4882 switch (coding->type)
4883 {
4884 case coding_type_sjis:
4885 decode_coding_sjis_big5 (coding, source, destination,
4886 src_bytes, dst_bytes, 1);
4887 break;
4888
4889 case coding_type_iso2022:
4890 decode_coding_iso2022 (coding, source, destination,
4891 src_bytes, dst_bytes);
4892 break;
4893
4894 case coding_type_big5:
4895 decode_coding_sjis_big5 (coding, source, destination,
4896 src_bytes, dst_bytes, 0);
4897 break;
4898
4899 case coding_type_emacs_mule:
4900 decode_coding_emacs_mule (coding, source, destination,
4901 src_bytes, dst_bytes);
4902 break;
4903
4904 case coding_type_ccl:
4905 if (coding->spec.ccl.cr_carryover)
4906 {
4907 /* Put the CR which was not processed by the previous call
4908 of decode_eol_post_ccl in DESTINATION. It will be
4909 decoded together with the following LF by the call to
4910 decode_eol_post_ccl below. */
4911 *destination = '\r';
4912 coding->produced++;
4913 coding->produced_char++;
4914 dst_bytes--;
4915 extra = coding->spec.ccl.cr_carryover;
4916 }
4917 ccl_coding_driver (coding, source, destination + extra,
4918 src_bytes, dst_bytes, 0);
4919 if (coding->eol_type != CODING_EOL_LF)
4920 {
4921 coding->produced += extra;
4922 coding->produced_char += extra;
4923 decode_eol_post_ccl (coding, destination, coding->produced);
4924 }
4925 break;
4926
4927 default:
4928 decode_eol (coding, source, destination, src_bytes, dst_bytes);
4929 }
4930
4931 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4932 && coding->mode & CODING_MODE_LAST_BLOCK
4933 && coding->consumed == src_bytes)
4934 coding->result = CODING_FINISH_NORMAL;
4935
4936 if (coding->mode & CODING_MODE_LAST_BLOCK
4937 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4938 {
4939 const unsigned char *src = source + coding->consumed;
4940 unsigned char *dst = destination + coding->produced;
4941
4942 src_bytes -= coding->consumed;
4943 coding->errors++;
4944 if (COMPOSING_P (coding))
4945 DECODE_COMPOSITION_END ('1');
4946 while (src_bytes--)
4947 {
4948 int c = *src++;
4949 dst += CHAR_STRING (c, dst);
4950 coding->produced_char++;
4951 }
4952 coding->consumed = coding->consumed_char = src - source;
4953 coding->produced = dst - destination;
4954 coding->result = CODING_FINISH_NORMAL;
4955 }
4956
4957 if (!coding->dst_multibyte)
4958 {
4959 coding->produced = str_as_unibyte (destination, coding->produced);
4960 coding->produced_char = coding->produced;
4961 }
4962
4963 return coding->result;
4964 }
4965
4966 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4967 multibyteness of the source is CODING->src_multibyte, the
4968 multibyteness of the result is always unibyte. */
4969
4970 int
4971 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4972 struct coding_system *coding;
4973 const unsigned char *source;
4974 unsigned char *destination;
4975 int src_bytes, dst_bytes;
4976 {
4977 coding->produced = coding->produced_char = 0;
4978 coding->consumed = coding->consumed_char = 0;
4979 coding->errors = 0;
4980 coding->result = CODING_FINISH_NORMAL;
4981
4982 switch (coding->type)
4983 {
4984 case coding_type_sjis:
4985 encode_coding_sjis_big5 (coding, source, destination,
4986 src_bytes, dst_bytes, 1);
4987 break;
4988
4989 case coding_type_iso2022:
4990 encode_coding_iso2022 (coding, source, destination,
4991 src_bytes, dst_bytes);
4992 break;
4993
4994 case coding_type_big5:
4995 encode_coding_sjis_big5 (coding, source, destination,
4996 src_bytes, dst_bytes, 0);
4997 break;
4998
4999 case coding_type_emacs_mule:
5000 encode_coding_emacs_mule (coding, source, destination,
5001 src_bytes, dst_bytes);
5002 break;
5003
5004 case coding_type_ccl:
5005 ccl_coding_driver (coding, source, destination,
5006 src_bytes, dst_bytes, 1);
5007 break;
5008
5009 default:
5010 encode_eol (coding, source, destination, src_bytes, dst_bytes);
5011 }
5012
5013 if (coding->mode & CODING_MODE_LAST_BLOCK
5014 && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5015 {
5016 const unsigned char *src = source + coding->consumed;
5017 unsigned char *dst = destination + coding->produced;
5018
5019 if (coding->type == coding_type_iso2022)
5020 ENCODE_RESET_PLANE_AND_REGISTER;
5021 if (COMPOSING_P (coding))
5022 *dst++ = ISO_CODE_ESC, *dst++ = '1';
5023 if (coding->consumed < src_bytes)
5024 {
5025 int len = src_bytes - coding->consumed;
5026
5027 BCOPY_SHORT (src, dst, len);
5028 if (coding->src_multibyte)
5029 len = str_as_unibyte (dst, len);
5030 dst += len;
5031 coding->consumed = src_bytes;
5032 }
5033 coding->produced = coding->produced_char = dst - destination;
5034 coding->result = CODING_FINISH_NORMAL;
5035 }
5036
5037 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5038 && coding->consumed == src_bytes)
5039 coding->result = CODING_FINISH_NORMAL;
5040
5041 return coding->result;
5042 }
5043
5044 /* Scan text in the region between *BEG and *END (byte positions),
5045 skip characters which we don't have to decode by coding system
5046 CODING at the head and tail, then set *BEG and *END to the region
5047 of the text we actually have to convert. The caller should move
5048 the gap out of the region in advance if the region is from a
5049 buffer.
5050
5051 If STR is not NULL, *BEG and *END are indices into STR. */
5052
5053 static void
5054 shrink_decoding_region (beg, end, coding, str)
5055 int *beg, *end;
5056 struct coding_system *coding;
5057 unsigned char *str;
5058 {
5059 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5060 int eol_conversion;
5061 Lisp_Object translation_table;
5062
5063 if (coding->type == coding_type_ccl
5064 || coding->type == coding_type_undecided
5065 || coding->eol_type != CODING_EOL_LF
5066 || !NILP (coding->post_read_conversion)
5067 || coding->composing != COMPOSITION_DISABLED)
5068 {
5069 /* We can't skip any data. */
5070 return;
5071 }
5072 if (coding->type == coding_type_no_conversion
5073 || coding->type == coding_type_raw_text
5074 || coding->type == coding_type_emacs_mule)
5075 {
5076 /* We need no conversion, but don't have to skip any data here.
5077 Decoding routine handles them effectively anyway. */
5078 return;
5079 }
5080
5081 translation_table = coding->translation_table_for_decode;
5082 if (NILP (translation_table) && !NILP (Venable_character_translation))
5083 translation_table = Vstandard_translation_table_for_decode;
5084 if (CHAR_TABLE_P (translation_table))
5085 {
5086 int i;
5087 for (i = 0; i < 128; i++)
5088 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5089 break;
5090 if (i < 128)
5091 /* Some ASCII character should be translated. We give up
5092 shrinking. */
5093 return;
5094 }
5095
5096 if (coding->heading_ascii >= 0)
5097 /* Detection routine has already found how much we can skip at the
5098 head. */
5099 *beg += coding->heading_ascii;
5100
5101 if (str)
5102 {
5103 begp_orig = begp = str + *beg;
5104 endp_orig = endp = str + *end;
5105 }
5106 else
5107 {
5108 begp_orig = begp = BYTE_POS_ADDR (*beg);
5109 endp_orig = endp = begp + *end - *beg;
5110 }
5111
5112 eol_conversion = (coding->eol_type == CODING_EOL_CR
5113 || coding->eol_type == CODING_EOL_CRLF);
5114
5115 switch (coding->type)
5116 {
5117 case coding_type_sjis:
5118 case coding_type_big5:
5119 /* We can skip all ASCII characters at the head. */
5120 if (coding->heading_ascii < 0)
5121 {
5122 if (eol_conversion)
5123 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5124 else
5125 while (begp < endp && *begp < 0x80) begp++;
5126 }
5127 /* We can skip all ASCII characters at the tail except for the
5128 second byte of SJIS or BIG5 code. */
5129 if (eol_conversion)
5130 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5131 else
5132 while (begp < endp && endp[-1] < 0x80) endp--;
5133 /* Do not consider LF as ascii if preceded by CR, since that
5134 confuses eol decoding. */
5135 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5136 endp++;
5137 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5138 endp++;
5139 break;
5140
5141 case coding_type_iso2022:
5142 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5143 /* We can't skip any data. */
5144 break;
5145 if (coding->heading_ascii < 0)
5146 {
5147 /* We can skip all ASCII characters at the head except for a
5148 few control codes. */
5149 while (begp < endp && (c = *begp) < 0x80
5150 && c != ISO_CODE_CR && c != ISO_CODE_SO
5151 && c != ISO_CODE_SI && c != ISO_CODE_ESC
5152 && (!eol_conversion || c != ISO_CODE_LF))
5153 begp++;
5154 }
5155 switch (coding->category_idx)
5156 {
5157 case CODING_CATEGORY_IDX_ISO_8_1:
5158 case CODING_CATEGORY_IDX_ISO_8_2:
5159 /* We can skip all ASCII characters at the tail. */
5160 if (eol_conversion)
5161 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5162 else
5163 while (begp < endp && endp[-1] < 0x80) endp--;
5164 /* Do not consider LF as ascii if preceded by CR, since that
5165 confuses eol decoding. */
5166 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5167 endp++;
5168 break;
5169
5170 case CODING_CATEGORY_IDX_ISO_7:
5171 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5172 {
5173 /* We can skip all characters at the tail except for 8-bit
5174 codes and ESC and the following 2-byte at the tail. */
5175 unsigned char *eight_bit = NULL;
5176
5177 if (eol_conversion)
5178 while (begp < endp
5179 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5180 {
5181 if (!eight_bit && c & 0x80) eight_bit = endp;
5182 endp--;
5183 }
5184 else
5185 while (begp < endp
5186 && (c = endp[-1]) != ISO_CODE_ESC)
5187 {
5188 if (!eight_bit && c & 0x80) eight_bit = endp;
5189 endp--;
5190 }
5191 /* Do not consider LF as ascii if preceded by CR, since that
5192 confuses eol decoding. */
5193 if (begp < endp && endp < endp_orig
5194 && endp[-1] == '\r' && endp[0] == '\n')
5195 endp++;
5196 if (begp < endp && endp[-1] == ISO_CODE_ESC)
5197 {
5198 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5199 /* This is an ASCII designation sequence. We can
5200 surely skip the tail. But, if we have
5201 encountered an 8-bit code, skip only the codes
5202 after that. */
5203 endp = eight_bit ? eight_bit : endp + 2;
5204 else
5205 /* Hmmm, we can't skip the tail. */
5206 endp = endp_orig;
5207 }
5208 else if (eight_bit)
5209 endp = eight_bit;
5210 }
5211 }
5212 break;
5213
5214 default:
5215 abort ();
5216 }
5217 *beg += begp - begp_orig;
5218 *end += endp - endp_orig;
5219 return;
5220 }
5221
5222 /* Like shrink_decoding_region but for encoding. */
5223
5224 static void
5225 shrink_encoding_region (beg, end, coding, str)
5226 int *beg, *end;
5227 struct coding_system *coding;
5228 unsigned char *str;
5229 {
5230 unsigned char *begp_orig, *begp, *endp_orig, *endp;
5231 int eol_conversion;
5232 Lisp_Object translation_table;
5233
5234 if (coding->type == coding_type_ccl
5235 || coding->eol_type == CODING_EOL_CRLF
5236 || coding->eol_type == CODING_EOL_CR
5237 || (coding->cmp_data && coding->cmp_data->used > 0))
5238 {
5239 /* We can't skip any data. */
5240 return;
5241 }
5242 if (coding->type == coding_type_no_conversion
5243 || coding->type == coding_type_raw_text
5244 || coding->type == coding_type_emacs_mule
5245 || coding->type == coding_type_undecided)
5246 {
5247 /* We need no conversion, but don't have to skip any data here.
5248 Encoding routine handles them effectively anyway. */
5249 return;
5250 }
5251
5252 translation_table = coding->translation_table_for_encode;
5253 if (NILP (translation_table) && !NILP (Venable_character_translation))
5254 translation_table = Vstandard_translation_table_for_encode;
5255 if (CHAR_TABLE_P (translation_table))
5256 {
5257 int i;
5258 for (i = 0; i < 128; i++)
5259 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5260 break;
5261 if (i < 128)
5262 /* Some ASCII character should be translated. We give up
5263 shrinking. */
5264 return;
5265 }
5266
5267 if (str)
5268 {
5269 begp_orig = begp = str + *beg;
5270 endp_orig = endp = str + *end;
5271 }
5272 else
5273 {
5274 begp_orig = begp = BYTE_POS_ADDR (*beg);
5275 endp_orig = endp = begp + *end - *beg;
5276 }
5277
5278 eol_conversion = (coding->eol_type == CODING_EOL_CR
5279 || coding->eol_type == CODING_EOL_CRLF);
5280
5281 /* Here, we don't have to check coding->pre_write_conversion because
5282 the caller is expected to have handled it already. */
5283 switch (coding->type)
5284 {
5285 case coding_type_iso2022:
5286 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5287 /* We can't skip any data. */
5288 break;
5289 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5290 {
5291 unsigned char *bol = begp;
5292 while (begp < endp && *begp < 0x80)
5293 {
5294 begp++;
5295 if (begp[-1] == '\n')
5296 bol = begp;
5297 }
5298 begp = bol;
5299 goto label_skip_tail;
5300 }
5301 /* fall down ... */
5302
5303 case coding_type_sjis:
5304 case coding_type_big5:
5305 /* We can skip all ASCII characters at the head and tail. */
5306 if (eol_conversion)
5307 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5308 else
5309 while (begp < endp && *begp < 0x80) begp++;
5310 label_skip_tail:
5311 if (eol_conversion)
5312 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5313 else
5314 while (begp < endp && *(endp - 1) < 0x80) endp--;
5315 break;
5316
5317 default:
5318 abort ();
5319 }
5320
5321 *beg += begp - begp_orig;
5322 *end += endp - endp_orig;
5323 return;
5324 }
5325
/* Shrinking the conversion region has some overhead of its own, so
   regions whose byte length does not exceed this value are converted
   as they are.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END in place with
   shrink_encoding_region (if ENCODEP is nonzero) or
   shrink_decoding_region (otherwise), provided the region is longer
   than the threshold above.  BEG and END are pointers to int.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
5339
5340 static Lisp_Object
5341 code_convert_region_unwind (arg)
5342 Lisp_Object arg;
5343 {
5344 inhibit_pre_post_conversion = 0;
5345 Vlast_coding_system_used = arg;
5346 return Qnil;
5347 }
5348
5349 /* Store information about all compositions in the range FROM and TO
5350 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
5351 buffer or a string, defaults to the current buffer. */
5352
5353 void
5354 coding_save_composition (coding, from, to, obj)
5355 struct coding_system *coding;
5356 int from, to;
5357 Lisp_Object obj;
5358 {
5359 Lisp_Object prop;
5360 int start, end;
5361
5362 if (coding->composing == COMPOSITION_DISABLED)
5363 return;
5364 if (!coding->cmp_data)
5365 coding_allocate_composition_data (coding, from);
5366 if (!find_composition (from, to, &start, &end, &prop, obj)
5367 || end > to)
5368 return;
5369 if (start < from
5370 && (!find_composition (end, to, &start, &end, &prop, obj)
5371 || end > to))
5372 return;
5373 coding->composing = COMPOSITION_NO;
5374 do
5375 {
5376 if (COMPOSITION_VALID_P (start, end, prop))
5377 {
5378 enum composition_method method = COMPOSITION_METHOD (prop);
5379 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5380 >= COMPOSITION_DATA_SIZE)
5381 coding_allocate_composition_data (coding, from);
5382 /* For relative composition, we remember start and end
5383 positions, for the other compositions, we also remember
5384 components. */
5385 CODING_ADD_COMPOSITION_START (coding, start - from, method);
5386 if (method != COMPOSITION_RELATIVE)
5387 {
5388 /* We must store a*/
5389 Lisp_Object val, ch;
5390
5391 val = COMPOSITION_COMPONENTS (prop);
5392 if (CONSP (val))
5393 while (CONSP (val))
5394 {
5395 ch = XCAR (val), val = XCDR (val);
5396 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5397 }
5398 else if (VECTORP (val) || STRINGP (val))
5399 {
5400 int len = (VECTORP (val)
5401 ? XVECTOR (val)->size : SCHARS (val));
5402 int i;
5403 for (i = 0; i < len; i++)
5404 {
5405 ch = (STRINGP (val)
5406 ? Faref (val, make_number (i))
5407 : XVECTOR (val)->contents[i]);
5408 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5409 }
5410 }
5411 else /* INTEGERP (val) */
5412 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5413 }
5414 CODING_ADD_COMPOSITION_END (coding, end - from);
5415 }
5416 start = end;
5417 }
5418 while (start < to
5419 && find_composition (start, to, &start, &end, &prop, obj)
5420 && end <= to);
5421
5422 /* Make coding->cmp_data point to the first memory block. */
5423 while (coding->cmp_data->prev)
5424 coding->cmp_data = coding->cmp_data->prev;
5425 coding->cmp_data_start = 0;
5426 }
5427
5428 /* Reflect the saved information about compositions to OBJ.
5429 CODING->cmp_data points to a memory block for the information. OBJ
5430 is a buffer or a string, defaults to the current buffer. */
5431
5432 void
5433 coding_restore_composition (coding, obj)
5434 struct coding_system *coding;
5435 Lisp_Object obj;
5436 {
5437 struct composition_data *cmp_data = coding->cmp_data;
5438
5439 if (!cmp_data)
5440 return;
5441
5442 while (cmp_data->prev)
5443 cmp_data = cmp_data->prev;
5444
5445 while (cmp_data)
5446 {
5447 int i;
5448
5449 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5450 i += cmp_data->data[i])
5451 {
5452 int *data = cmp_data->data + i;
5453 enum composition_method method = (enum composition_method) data[3];
5454 Lisp_Object components;
5455
5456 if (data[0] < 0 || i + data[0] > cmp_data->used)
5457 /* Invalid composition data. */
5458 break;
5459
5460 if (method == COMPOSITION_RELATIVE)
5461 components = Qnil;
5462 else
5463 {
5464 int len = data[0] - 4, j;
5465 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5466
5467 if (method == COMPOSITION_WITH_RULE_ALTCHARS
5468 && len % 2 == 0)
5469 len --;
5470 if (len < 1)
5471 /* Invalid composition data. */
5472 break;
5473 for (j = 0; j < len; j++)
5474 args[j] = make_number (data[4 + j]);
5475 components = (method == COMPOSITION_WITH_ALTCHARS
5476 ? Fstring (len, args)
5477 : Fvector (len, args));
5478 }
5479 compose_text (data[1], data[2], components, Qnil, obj);
5480 }
5481 cmp_data = cmp_data->next;
5482 }
5483 }
5484
5485 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5486 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5487 coding system CODING, and return the status code of code conversion
5488 (currently, this value has no meaning).
5489
5490 How many characters (and bytes) are converted to how many
5491 characters (and bytes) are recorded in members of the structure
5492 CODING.
5493
5494 If REPLACE is nonzero, we do various things as if the original text
5495 is deleted and a new text is inserted. See the comments in
5496 replace_range (insdel.c) to know what we are doing.
5497
5498 If REPLACE is zero, it is assumed that the source text is unibyte.
5499 Otherwise, it is assumed that the source text is multibyte. */
5500
5501 int
5502 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5503 int from, from_byte, to, to_byte, encodep, replace;
5504 struct coding_system *coding;
5505 {
5506 int len = to - from, len_byte = to_byte - from_byte;
5507 int nchars_del = 0, nbytes_del = 0;
5508 int require, inserted, inserted_byte;
5509 int head_skip, tail_skip, total_skip = 0;
5510 Lisp_Object saved_coding_symbol;
5511 int first = 1;
5512 unsigned char *src, *dst;
5513 Lisp_Object deletion;
5514 int orig_point = PT, orig_len = len;
5515 int prev_Z;
5516 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5517
5518 deletion = Qnil;
5519 saved_coding_symbol = coding->symbol;
5520
5521 if (from < PT && PT < to)
5522 {
5523 TEMP_SET_PT_BOTH (from, from_byte);
5524 orig_point = from;
5525 }
5526
5527 if (replace)
5528 {
5529 int saved_from = from;
5530 int saved_inhibit_modification_hooks;
5531
5532 prepare_to_modify_buffer (from, to, &from);
5533 if (saved_from != from)
5534 {
5535 to = from + len;
5536 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5537 len_byte = to_byte - from_byte;
5538 }
5539
5540 /* The code conversion routine can not preserve text properties
5541 for now. So, we must remove all text properties in the
5542 region. Here, we must suppress all modification hooks. */
5543 saved_inhibit_modification_hooks = inhibit_modification_hooks;
5544 inhibit_modification_hooks = 1;
5545 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5546 inhibit_modification_hooks = saved_inhibit_modification_hooks;
5547 }
5548
5549 if (! encodep && CODING_REQUIRE_DETECTION (coding))
5550 {
5551 /* We must detect encoding of text and eol format. */
5552
5553 if (from < GPT && to > GPT)
5554 move_gap_both (from, from_byte);
5555 if (coding->type == coding_type_undecided)
5556 {
5557 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5558 if (coding->type == coding_type_undecided)
5559 {
5560 /* It seems that the text contains only ASCII, but we
5561 should not leave it undecided because the deeper
5562 decoding routine (decode_coding) tries to detect the
5563 encodings again in vain. */
5564 coding->type = coding_type_emacs_mule;
5565 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5566 /* As emacs-mule decoder will handle composition, we
5567 need this setting to allocate coding->cmp_data
5568 later. */
5569 coding->composing = COMPOSITION_NO;
5570 }
5571 }
5572 if (coding->eol_type == CODING_EOL_UNDECIDED
5573 && coding->type != coding_type_ccl)
5574 {
5575 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5576 if (coding->eol_type == CODING_EOL_UNDECIDED)
5577 coding->eol_type = CODING_EOL_LF;
5578 /* We had better recover the original eol format if we
5579 encounter an inconsistent eol format while decoding. */
5580 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5581 }
5582 }
5583
5584 /* Now we convert the text. */
5585
5586 /* For encoding, we must process pre-write-conversion in advance. */
5587 if (! inhibit_pre_post_conversion
5588 && encodep
5589 && SYMBOLP (coding->pre_write_conversion)
5590 && ! NILP (Ffboundp (coding->pre_write_conversion)))
5591 {
5592 /* The function in pre-write-conversion may put a new text in a
5593 new buffer. */
5594 struct buffer *prev = current_buffer;
5595 Lisp_Object new;
5596
5597 record_unwind_protect (code_convert_region_unwind,
5598 Vlast_coding_system_used);
5599 /* We should not call any more pre-write/post-read-conversion
5600 functions while this pre-write-conversion is running. */
5601 inhibit_pre_post_conversion = 1;
5602 call2 (coding->pre_write_conversion,
5603 make_number (from), make_number (to));
5604 inhibit_pre_post_conversion = 0;
5605 /* Discard the unwind protect. */
5606 specpdl_ptr--;
5607
5608 if (current_buffer != prev)
5609 {
5610 len = ZV - BEGV;
5611 new = Fcurrent_buffer ();
5612 set_buffer_internal_1 (prev);
5613 del_range_2 (from, from_byte, to, to_byte, 0);
5614 TEMP_SET_PT_BOTH (from, from_byte);
5615 insert_from_buffer (XBUFFER (new), 1, len, 0);
5616 Fkill_buffer (new);
5617 if (orig_point >= to)
5618 orig_point += len - orig_len;
5619 else if (orig_point > from)
5620 orig_point = from;
5621 orig_len = len;
5622 to = from + len;
5623 from_byte = CHAR_TO_BYTE (from);
5624 to_byte = CHAR_TO_BYTE (to);
5625 len_byte = to_byte - from_byte;
5626 TEMP_SET_PT_BOTH (from, from_byte);
5627 }
5628 }
5629
5630 if (replace)
5631 {
5632 if (! EQ (current_buffer->undo_list, Qt))
5633 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5634 else
5635 {
5636 nchars_del = to - from;
5637 nbytes_del = to_byte - from_byte;
5638 }
5639 }
5640
5641 if (coding->composing != COMPOSITION_DISABLED)
5642 {
5643 if (encodep)
5644 coding_save_composition (coding, from, to, Fcurrent_buffer ());
5645 else
5646 coding_allocate_composition_data (coding, from);
5647 }
5648
5649 /* Try to skip the heading and tailing ASCIIs. */
5650 if (coding->type != coding_type_ccl)
5651 {
5652 int from_byte_orig = from_byte, to_byte_orig = to_byte;
5653
5654 if (from < GPT && GPT < to)
5655 move_gap_both (from, from_byte);
5656 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5657 if (from_byte == to_byte
5658 && (encodep || NILP (coding->post_read_conversion))
5659 && ! CODING_REQUIRE_FLUSHING (coding))
5660 {
5661 coding->produced = len_byte;
5662 coding->produced_char = len;
5663 if (!replace)
5664 /* We must record and adjust for this new text now. */
5665 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5666 return 0;
5667 }
5668
5669 head_skip = from_byte - from_byte_orig;
5670 tail_skip = to_byte_orig - to_byte;
5671 total_skip = head_skip + tail_skip;
5672 from += head_skip;
5673 to -= tail_skip;
5674 len -= total_skip; len_byte -= total_skip;
5675 }
5676
5677 /* For conversion, we must put the gap before the text in addition to
5678 making the gap larger for efficient decoding. The required gap
5679 size starts from 2000 which is the magic number used in make_gap.
5680 But, after one batch of conversion, it will be incremented if we
5681 find that it is not enough . */
5682 require = 2000;
5683
5684 if (GAP_SIZE < require)
5685 make_gap (require - GAP_SIZE);
5686 move_gap_both (from, from_byte);
5687
5688 inserted = inserted_byte = 0;
5689
5690 GAP_SIZE += len_byte;
5691 ZV -= len;
5692 Z -= len;
5693 ZV_BYTE -= len_byte;
5694 Z_BYTE -= len_byte;
5695
5696 if (GPT - BEG < BEG_UNCHANGED)
5697 BEG_UNCHANGED = GPT - BEG;
5698 if (Z - GPT < END_UNCHANGED)
5699 END_UNCHANGED = Z - GPT;
5700
5701 if (!encodep && coding->src_multibyte)
5702 {
5703 /* Decoding routines expects that the source text is unibyte.
5704 We must convert 8-bit characters of multibyte form to
5705 unibyte. */
5706 int len_byte_orig = len_byte;
5707 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5708 if (len_byte < len_byte_orig)
5709 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5710 len_byte);
5711 coding->src_multibyte = 0;
5712 }
5713
5714 for (;;)
5715 {
5716 int result;
5717
5718 /* The buffer memory is now:
5719 +--------+converted-text+---------+-------original-text-------+---+
5720 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5721 |<---------------------- GAP ----------------------->| */
5722 src = GAP_END_ADDR - len_byte;
5723 dst = GPT_ADDR + inserted_byte;
5724
5725 if (encodep)
5726 result = encode_coding (coding, src, dst, len_byte, 0);
5727 else
5728 {
5729 if (coding->composing != COMPOSITION_DISABLED)
5730 coding->cmp_data->char_offset = from + inserted;
5731 result = decode_coding (coding, src, dst, len_byte, 0);
5732 }
5733
5734 /* The buffer memory is now:
5735 +--------+-------converted-text----+--+------original-text----+---+
5736 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5737 |<---------------------- GAP ----------------------->| */
5738
5739 inserted += coding->produced_char;
5740 inserted_byte += coding->produced;
5741 len_byte -= coding->consumed;
5742
5743 if (result == CODING_FINISH_INSUFFICIENT_CMP)
5744 {
5745 coding_allocate_composition_data (coding, from + inserted);
5746 continue;
5747 }
5748
5749 src += coding->consumed;
5750 dst += coding->produced;
5751
5752 if (result == CODING_FINISH_NORMAL)
5753 {
5754 src += len_byte;
5755 break;
5756 }
5757 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5758 {
5759 unsigned char *pend = dst, *p = pend - inserted_byte;
5760 Lisp_Object eol_type;
5761
5762 /* Encode LFs back to the original eol format (CR or CRLF). */
5763 if (coding->eol_type == CODING_EOL_CR)
5764 {
5765 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5766 }
5767 else
5768 {
5769 int count = 0;
5770
5771 while (p < pend) if (*p++ == '\n') count++;
5772 if (src - dst < count)
5773 {
5774 /* We don't have sufficient room for encoding LFs
5775 back to CRLF. We must record converted and
5776 not-yet-converted text back to the buffer
5777 content, enlarge the gap, then record them out of
5778 the buffer contents again. */
5779 int add = len_byte + inserted_byte;
5780
5781 GAP_SIZE -= add;
5782 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5783 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5784 make_gap (count - GAP_SIZE);
5785 GAP_SIZE += add;
5786 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5787 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5788 /* Don't forget to update SRC, DST, and PEND. */
5789 src = GAP_END_ADDR - len_byte;
5790 dst = GPT_ADDR + inserted_byte;
5791 pend = dst;
5792 }
5793 inserted += count;
5794 inserted_byte += count;
5795 coding->produced += count;
5796 p = dst = pend + count;
5797 while (count)
5798 {
5799 *--p = *--pend;
5800 if (*p == '\n') count--, *--p = '\r';
5801 }
5802 }
5803
5804 /* Suppress eol-format conversion in the further conversion. */
5805 coding->eol_type = CODING_EOL_LF;
5806
5807 /* Set the coding system symbol to that for Unix-like EOL. */
5808 eol_type = Fget (saved_coding_symbol, Qeol_type);
5809 if (VECTORP (eol_type)
5810 && XVECTOR (eol_type)->size == 3
5811 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5812 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5813 else
5814 coding->symbol = saved_coding_symbol;
5815
5816 continue;
5817 }
5818 if (len_byte <= 0)
5819 {
5820 if (coding->type != coding_type_ccl
5821 || coding->mode & CODING_MODE_LAST_BLOCK)
5822 break;
5823 coding->mode |= CODING_MODE_LAST_BLOCK;
5824 continue;
5825 }
5826 if (result == CODING_FINISH_INSUFFICIENT_SRC)
5827 {
5828 /* The source text ends in invalid codes. Let's just
5829 make them valid buffer contents, and finish conversion. */
5830 if (multibyte_p)
5831 {
5832 unsigned char *start = dst;
5833
5834 inserted += len_byte;
5835 while (len_byte--)
5836 {
5837 int c = *src++;
5838 dst += CHAR_STRING (c, dst);
5839 }
5840
5841 inserted_byte += dst - start;
5842 }
5843 else
5844 {
5845 inserted += len_byte;
5846 inserted_byte += len_byte;
5847 while (len_byte--)
5848 *dst++ = *src++;
5849 }
5850 break;
5851 }
5852 if (result == CODING_FINISH_INTERRUPT)
5853 {
5854 /* The conversion procedure was interrupted by a user. */
5855 break;
5856 }
5857 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
5858 if (coding->consumed < 1)
5859 {
5860 /* It's quite strange to require more memory without
5861 consuming any bytes. Perhaps CCL program bug. */
5862 break;
5863 }
5864 if (first)
5865 {
5866 /* We have just done the first batch of conversion which was
5867 stopped because of insufficient gap. Let's reconsider the
5868 required gap size (i.e. SRT - DST) now.
5869
5870 We have converted ORIG bytes (== coding->consumed) into
5871 NEW bytes (coding->produced). To convert the remaining
5872 LEN bytes, we may need REQUIRE bytes of gap, where:
5873 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5874 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5875 Here, we are sure that NEW >= ORIG. */
5876 float ratio;
5877
5878 if (coding->produced <= coding->consumed)
5879 {
5880 /* This happens because of CCL-based coding system with
5881 eol-type CRLF. */
5882 require = 0;
5883 }
5884 else
5885 {
5886 ratio = (coding->produced - coding->consumed) / coding->consumed;
5887 require = len_byte * ratio;
5888 }
5889 first = 0;
5890 }
5891 if ((src - dst) < (require + 2000))
5892 {
5893 /* See the comment above the previous call of make_gap. */
5894 int add = len_byte + inserted_byte;
5895
5896 GAP_SIZE -= add;
5897 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5898 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5899 make_gap (require + 2000);
5900 GAP_SIZE += add;
5901 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5902 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5903 }
5904 }
5905 if (src - dst > 0) *dst = 0; /* Put an anchor. */
5906
5907 if (encodep && coding->dst_multibyte)
5908 {
5909 /* The output is unibyte. We must convert 8-bit characters to
5910 multibyte form. */
5911 if (inserted_byte * 2 > GAP_SIZE)
5912 {
5913 GAP_SIZE -= inserted_byte;
5914 ZV += inserted_byte; Z += inserted_byte;
5915 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5916 GPT += inserted_byte; GPT_BYTE += inserted_byte;
5917 make_gap (inserted_byte - GAP_SIZE);
5918 GAP_SIZE += inserted_byte;
5919 ZV -= inserted_byte; Z -= inserted_byte;
5920 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5921 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5922 }
5923 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5924 }
5925
5926 /* If we shrank the conversion area, adjust it now. */
5927 if (total_skip > 0)
5928 {
5929 if (tail_skip > 0)
5930 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5931 inserted += total_skip; inserted_byte += total_skip;
5932 GAP_SIZE += total_skip;
5933 GPT -= head_skip; GPT_BYTE -= head_skip;
5934 ZV -= total_skip; ZV_BYTE -= total_skip;
5935 Z -= total_skip; Z_BYTE -= total_skip;
5936 from -= head_skip; from_byte -= head_skip;
5937 to += tail_skip; to_byte += tail_skip;
5938 }
5939
5940 prev_Z = Z;
5941 if (! EQ (current_buffer->undo_list, Qt))
5942 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5943 else
5944 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5945 inserted, inserted_byte);
5946 inserted = Z - prev_Z;
5947
5948 if (!encodep && coding->cmp_data && coding->cmp_data->used)
5949 coding_restore_composition (coding, Fcurrent_buffer ());
5950 coding_free_composition_data (coding);
5951
5952 if (! inhibit_pre_post_conversion
5953 && ! encodep && ! NILP (coding->post_read_conversion))
5954 {
5955 Lisp_Object val;
5956 Lisp_Object saved_coding_system;
5957
5958 if (from != PT)
5959 TEMP_SET_PT_BOTH (from, from_byte);
5960 prev_Z = Z;
5961 record_unwind_protect (code_convert_region_unwind,
5962 Vlast_coding_system_used);
5963 saved_coding_system = Vlast_coding_system_used;
5964 Vlast_coding_system_used = coding->symbol;
5965 /* We should not call any more pre-write/post-read-conversion
5966 functions while this post-read-conversion is running. */
5967 inhibit_pre_post_conversion = 1;
5968 val = call1 (coding->post_read_conversion, make_number (inserted));
5969 inhibit_pre_post_conversion = 0;
5970 coding->symbol = Vlast_coding_system_used;
5971 Vlast_coding_system_used = saved_coding_system;
5972 /* Discard the unwind protect. */
5973 specpdl_ptr--;
5974 CHECK_NUMBER (val);
5975 inserted += Z - prev_Z;
5976 }
5977
5978 if (orig_point >= from)
5979 {
5980 if (orig_point >= from + orig_len)
5981 orig_point += inserted - orig_len;
5982 else
5983 orig_point = from;
5984 TEMP_SET_PT (orig_point);
5985 }
5986
5987 if (replace)
5988 {
5989 signal_after_change (from, to - from, inserted);
5990 update_compositions (from, from + inserted, CHECK_BORDER);
5991 }
5992
5993 {
5994 coding->consumed = to_byte - from_byte;
5995 coding->consumed_char = to - from;
5996 coding->produced = inserted_byte;
5997 coding->produced_char = inserted;
5998 }
5999
6000 return 0;
6001 }
6002
/* Run a conversion hook of CODING on the text of STR and return the
   resulting text as a new string.

   STR is inserted, without unibyte<->multibyte conversion, into the
   work buffer " *code-converting-work*", which is made current for
   the duration of the call.  If ENCODEP is nonzero,
   CODING->pre_write_conversion is called with the buffer bounds BEG
   and Z.  Otherwise CODING->post_read_conversion is called with
   point at BEG and the number of characters in the buffer, and
   CODING->symbol is then updated from Vlast_coding_system_used.

   The unwind-protect forms recorded at entry (Fset_buffer on the
   buffer that was current, and code_convert_region_unwind) are run
   by the final unbind_to.  */
Lisp_Object
run_pre_post_conversion_on_str (str, coding, encodep)
     Lisp_Object str;
     struct coding_system *coding;
     int encodep;
{
  int count = SPECPDL_INDEX ();
  struct gcpro gcpro1, gcpro2;
  /* Remember the multibyteness of STR; the work buffer is set to
     match it below.  */
  int multibyte = STRING_MULTIBYTE (str);
  Lisp_Object buffer;
  struct buffer *buf;
  Lisp_Object old_deactivate_mark;

  record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
  record_unwind_protect (code_convert_region_unwind,
                         Vlast_coding_system_used);
  /* It is not crucial to specbind this.  */
  old_deactivate_mark = Vdeactivate_mark;
  GCPRO2 (str, old_deactivate_mark);

  buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
  buf = XBUFFER (buffer);

  /* Reset the state of the work buffer before using it: no overlays,
     not read-only, no visited file, and undo recording disabled.  */
  delete_all_overlays (buf);
  buf->directory = current_buffer->directory;
  buf->read_only = Qnil;
  buf->filename = Qnil;
  buf->undo_list = Qt;
  eassert (buf->overlays_before == NULL);
  eassert (buf->overlays_after == NULL);

  set_buffer_internal (buf);
  /* We must insert the contents of STR as is without
     unibyte<->multibyte conversion.  For that, we adjust the
     multibyteness of the working buffer to that of STR.  */
  Ferase_buffer ();
  buf->enable_multibyte_characters = multibyte ? Qt : Qnil;

  insert_from_string (str, 0, 0,
                      SCHARS (str), SBYTES (str), 0);
  UNGCPRO;
  /* Keep the hook from triggering further pre-write/post-read
     conversions while it runs.  */
  inhibit_pre_post_conversion = 1;
  if (encodep)
    call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
  else
    {
      Vlast_coding_system_used = coding->symbol;
      TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
      call1 (coding->post_read_conversion, make_number (Z - BEG));
      /* The hook may have changed Vlast_coding_system_used; record
         its value in CODING.  */
      coding->symbol = Vlast_coding_system_used;
    }
  inhibit_pre_post_conversion = 0;
  Vdeactivate_mark = old_deactivate_mark;
  /* The whole content of the work buffer is the converted text.  */
  str = make_buffer_string (BEG, Z, 1);
  return unbind_to (count, str);
}
6059
/* Decode the text of STR according to CODING and return the result
   as a string.

   If CODING requires detection, the coding type and/or end-of-line
   format are first detected from the bytes of STR.  A multibyte STR
   is converted with Fstring_as_unibyte before decoding.  When no
   decoding is required and CODING->post_read_conversion is not an
   fbound symbol, STR (converted with Fstring_as_multibyte if
   CODING->dst_multibyte is set) is returned; it is copied only if
   NOCOPY is still zero at that point.

   Otherwise the bytes left after SHRINK_CONVERSION_REGION are
   decoded into a conversion buffer, the skipped head and tail bytes
   are copied around the decoded text, and the post-read-conversion
   function, if any, is run on the result.  CODING->consumed,
   consumed_char, produced and produced_char are set to totals that
   include the skipped bytes.  */
Lisp_Object
decode_coding_string (str, coding, nocopy)
     Lisp_Object str;
     struct coding_system *coding;
     int nocopy;
{
  int len;
  struct conversion_buffer buf;
  int from, to_byte;
  Lisp_Object saved_coding_symbol;
  int result;
  int require_decoding;
  /* Number of head plus tail bytes skipped by
     SHRINK_CONVERSION_REGION.  */
  int shrinked_bytes = 0;
  Lisp_Object newstr;
  /* Running totals over all calls to decode_coding below.  */
  int consumed, consumed_char, produced, produced_char;

  from = 0;
  to_byte = SBYTES (str);

  saved_coding_symbol = coding->symbol;
  coding->src_multibyte = STRING_MULTIBYTE (str);
  coding->dst_multibyte = 1;
  if (CODING_REQUIRE_DETECTION (coding))
    {
      /* See the comments in code_convert_region.  */
      if (coding->type == coding_type_undecided)
        {
          detect_coding (coding, SDATA (str), to_byte);
          if (coding->type == coding_type_undecided)
            {
              coding->type = coding_type_emacs_mule;
              coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
              /* As emacs-mule decoder will handle composition, we
                 need this setting to allocate coding->cmp_data
                 later.  */
              coding->composing = COMPOSITION_NO;
            }
        }
      if (coding->eol_type == CODING_EOL_UNDECIDED
          && coding->type != coding_type_ccl)
        {
          saved_coding_symbol = coding->symbol;
          detect_eol (coding, SDATA (str), to_byte);
          if (coding->eol_type == CODING_EOL_UNDECIDED)
            coding->eol_type = CODING_EOL_LF;
          /* We had better recover the original eol format if we
             encounter an inconsistent eol format while decoding.  */
          coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
        }
    }

  if (coding->type == coding_type_no_conversion
      || coding->type == coding_type_raw_text)
    coding->dst_multibyte = 0;

  require_decoding = CODING_REQUIRE_DECODING (coding);

  if (STRING_MULTIBYTE (str))
    {
      /* Decoding routines expect the source text to be unibyte.  */
      str = Fstring_as_unibyte (str);
      to_byte = SBYTES (str);
      /* STR is now the value of Fstring_as_unibyte, so the early
         return below hands it back without another copy.  */
      nocopy = 1;
      coding->src_multibyte = 0;
    }

  /* Try to skip the heading and tailing ASCIIs.  */
  if (require_decoding && coding->type != coding_type_ccl)
    {
      SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
                                0);
      if (from == to_byte)
        require_decoding = 0;
      shrinked_bytes = from + (SBYTES (str) - to_byte);
    }

  /* Nothing to decode and no usable post-read-conversion function:
     return STR (or a copy) directly.  */
  if (!require_decoding
      && !(SYMBOLP (coding->post_read_conversion)
           && !NILP (Ffboundp (coding->post_read_conversion))))
    {
      coding->consumed = SBYTES (str);
      coding->consumed_char = SCHARS (str);
      if (coding->dst_multibyte)
        {
          str = Fstring_as_multibyte (str);
          nocopy = 1;
        }
      coding->produced = SBYTES (str);
      coding->produced_char = SCHARS (str);
      return (nocopy ? str : Fcopy_sequence (str));
    }

  if (coding->composing != COMPOSITION_DISABLED)
    coding_allocate_composition_data (coding, from);
  len = decoding_buffer_size (coding, to_byte - from);
  allocate_conversion_buffer (buf, len);

  consumed = consumed_char = produced = produced_char = 0;
  /* Call decode_coding repeatedly on the not yet consumed bytes,
     handling each non-final result code before retrying.  */
  while (1)
    {
      result = decode_coding (coding, SDATA (str) + from + consumed,
                              buf.data + produced, to_byte - from - consumed,
                              buf.size - produced);
      consumed += coding->consumed;
      consumed_char += coding->consumed_char;
      produced += coding->produced;
      produced_char += coding->produced_char;
      /* Stop on normal completion, or when the source is reported
         insufficient and the last call consumed nothing.  */
      if (result == CODING_FINISH_NORMAL
          || (result == CODING_FINISH_INSUFFICIENT_SRC
              && coding->consumed == 0))
        break;
      if (result == CODING_FINISH_INSUFFICIENT_CMP)
        coding_allocate_composition_data (coding, from + produced_char);
      else if (result == CODING_FINISH_INSUFFICIENT_DST)
        extend_conversion_buffer (&buf);
      else if (result == CODING_FINISH_INCONSISTENT_EOL)
        {
          Lisp_Object eol_type;

          /* Recover the original EOL format.  */
          if (coding->eol_type == CODING_EOL_CR)
            {
              unsigned char *p;
              for (p = buf.data; p < buf.data + produced; p++)
                if (*p == '\n') *p = '\r';
            }
          else if (coding->eol_type == CODING_EOL_CRLF)
            {
              int num_eol = 0;
              unsigned char *p0, *p1;
              /* Count the newlines produced so far.  */
              for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
                if (*p0 == '\n') num_eol++;
              if (produced + num_eol >= buf.size)
                extend_conversion_buffer (&buf);
              /* Copy backward from the end so that the text can be
                 expanded in place, putting '\r' before each '\n'.  */
              for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
                {
                  *--p1 = *--p0;
                  if (*p0 == '\n') *--p1 = '\r';
                }
              produced += num_eol;
              produced_char += num_eol;
            }
          /* Suppress eol-format conversion in the further conversion.  */
          coding->eol_type = CODING_EOL_LF;

          /* Set the coding system symbol to that for Unix-like EOL.  */
          eol_type = Fget (saved_coding_symbol, Qeol_type);
          if (VECTORP (eol_type)
              && XVECTOR (eol_type)->size == 3
              && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
            coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
          else
            coding->symbol = saved_coding_symbol;


        }
    }

  coding->consumed = consumed;
  coding->consumed_char = consumed_char;
  coding->produced = produced;
  coding->produced_char = produced_char;

  /* Build the result: skipped head, decoded text, skipped tail.  */
  if (coding->dst_multibyte)
    newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
                                           produced + shrinked_bytes);
  else
    newstr = make_uninit_string (produced + shrinked_bytes);
  if (from > 0)
    STRING_COPYIN (newstr, 0, SDATA (str), from);
  STRING_COPYIN (newstr, from, buf.data, produced);
  if (shrinked_bytes > from)
    STRING_COPYIN (newstr, from + produced,
                   SDATA (str) + to_byte,
                   shrinked_bytes - from);
  free_conversion_buffer (&buf);

  /* Account for the skipped bytes; the same count is added to the
     byte and the character totals.  */
  coding->consumed += shrinked_bytes;
  coding->consumed_char += shrinked_bytes;
  coding->produced += shrinked_bytes;
  coding->produced_char += shrinked_bytes;

  if (coding->cmp_data && coding->cmp_data->used)
    coding_restore_composition (coding, newstr);
  coding_free_composition_data (coding);

  if (SYMBOLP (coding->post_read_conversion)
      && !NILP (Ffboundp (coding->post_read_conversion)))
    newstr = run_pre_post_conversion_on_str (newstr, coding, 0);

  return newstr;
}
6252
6253 Lisp_Object
6254 encode_coding_string (str, coding, nocopy)
6255 Lisp_Object str;
6256 struct coding_system *coding;
6257 int nocopy;
6258 {
6259 int len;
6260 struct conversion_buffer buf;
6261 int from, to, to_byte;
6262 int result;
6263 int shrinked_bytes = 0;
6264 Lisp_Object newstr;
6265 int consumed, consumed_char, produced, produced_char;
6266
6267 if (SYMBOLP (coding->pre_write_conversion)
6268 && !NILP (Ffboundp (coding->pre_write_conversion)))
6269 str = run_pre_post_conversion_on_str (str, coding, 1);
6270
6271 from = 0;
6272 to = SCHARS (str);
6273 to_byte = SBYTES (str);
6274
6275 /* Encoding routines determine the multibyteness of the source text
6276 by coding->src_multibyte. */
6277 coding->src_multibyte = STRING_MULTIBYTE (str);
6278 coding->dst_multibyte = 0;
6279 if (! CODING_REQUIRE_ENCODING (coding))
6280 {
6281 coding->consumed = SBYTES (str);
6282 coding->consumed_char = SCHARS (str);
6283 if (STRING_MULTIBYTE (str))
6284 {
6285 str = Fstring_as_unibyte (str);
6286 nocopy = 1;
6287 }
6288 coding->produced = SBYTES (str);
6289 coding->produced_char = SCHARS (str);
6290 return (nocopy ? str : Fcopy_sequence (str));
6291 }
6292
6293 if (coding->composing != COMPOSITION_DISABLED)
6294 coding_save_composition (coding, from, to, str);
6295
6296 /* Try to skip the heading and tailing ASCIIs. */
6297 if (coding->type != coding_type_ccl)
6298 {
6299 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6300 1);
6301 if (from == to_byte)
6302 return (nocopy ? str : Fcopy_sequence (str));
6303 shrinked_bytes = from + (SBYTES (str) - to_byte);
6304 }
6305
6306 len = encoding_buffer_size (coding, to_byte - from);
6307 allocate_conversion_buffer (buf, len);
6308
6309 consumed = consumed_char = produced = produced_char = 0;
6310 while (1)
6311 {
6312 result = encode_coding (coding, SDATA (str) + from + consumed,
6313 buf.data + produced, to_byte - from - consumed,
6314 buf.size - produced);
6315 consumed += coding->consumed;
6316 consumed_char += coding->consumed_char;
6317 produced += coding->produced;
6318 produced_char += coding->produced_char;
6319 if (result == CODING_FINISH_NORMAL
6320 || result == CODING_FINISH_INTERRUPT
6321 || (result == CODING_FINISH_INSUFFICIENT_SRC
6322 && coding->consumed == 0))
6323 break;
6324 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */
6325 extend_conversion_buffer (&buf);
6326 }
6327
6328 coding->consumed = consumed;
6329 coding->consumed_char = consumed_char;
6330 coding->produced = produced;
6331 coding->produced_char = produced_char;
6332
6333 newstr = make_uninit_string (produced + shrinked_bytes);
6334 if (from > 0)
6335 STRING_COPYIN (newstr, 0, SDATA (str), from);
6336 STRING_COPYIN (newstr, from, buf.data, produced);
6337 if (shrinked_bytes > from)
6338 STRING_COPYIN (newstr, from + produced,
6339 SDATA (str) + to_byte,
6340 shrinked_bytes - from);
6341
6342 free_conversion_buffer (&buf);
6343 coding_free_composition_data (coding);
6344
6345 return newstr;
6346 }
6347
6348 \f
6349 #ifdef emacs
6350 /*** 8. Emacs Lisp library functions ***/
6351
6352 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6353 doc: /* Return t if OBJECT is nil or a coding-system.
6354 See the documentation of `make-coding-system' for information
6355 about coding-system objects. */)
6356 (obj)
6357 Lisp_Object obj;
6358 {
6359 if (NILP (obj))
6360 return Qt;
6361 if (!SYMBOLP (obj))
6362 return Qnil;
6363 if (! NILP (Fget (obj, Qcoding_system_define_form)))
6364 return Qt;
6365 /* Get coding-spec vector for OBJ. */
6366 obj = Fget (obj, Qcoding_system);
6367 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6368 ? Qt : Qnil);
6369 }
6370
6371 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6372 Sread_non_nil_coding_system, 1, 1, 0,
6373 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6374 (prompt)
6375 Lisp_Object prompt;
6376 {
6377 Lisp_Object val;
6378 do
6379 {
6380 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6381 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6382 }
6383 while (SCHARS (val) == 0);
6384 return (Fintern (val, Qnil));
6385 }
6386
6387 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6388 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6389 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6390 (prompt, default_coding_system)
6391 Lisp_Object prompt, default_coding_system;
6392 {
6393 Lisp_Object val;
6394 if (SYMBOLP (default_coding_system))
6395 default_coding_system = SYMBOL_NAME (default_coding_system);
6396 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6397 Qt, Qnil, Qcoding_system_history,
6398 default_coding_system, Qnil);
6399 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6400 }
6401
6402 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6403 1, 1, 0,
6404 doc: /* Check validity of CODING-SYSTEM.
6405 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6406 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6407 The value of this property should be a vector of length 5. */)
6408 (coding_system)
6409 Lisp_Object coding_system;
6410 {
6411 Lisp_Object define_form;
6412
6413 define_form = Fget (coding_system, Qcoding_system_define_form);
6414 if (! NILP (define_form))
6415 {
6416 Fput (coding_system, Qcoding_system_define_form, Qnil);
6417 safe_eval (define_form);
6418 }
6419 if (!NILP (Fcoding_system_p (coding_system)))
6420 return coding_system;
6421 while (1)
6422 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6423 }
6424 \f
6425 Lisp_Object
6426 detect_coding_system (src, src_bytes, highest, multibytep)
6427 const unsigned char *src;
6428 int src_bytes, highest;
6429 int multibytep;
6430 {
6431 int coding_mask, eol_type;
6432 Lisp_Object val, tmp;
6433 int dummy;
6434
6435 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6436 eol_type = detect_eol_type (src, src_bytes, &dummy);
6437 if (eol_type == CODING_EOL_INCONSISTENT)
6438 eol_type = CODING_EOL_UNDECIDED;
6439
6440 if (!coding_mask)
6441 {
6442 val = Qundecided;
6443 if (eol_type != CODING_EOL_UNDECIDED)
6444 {
6445 Lisp_Object val2;
6446 val2 = Fget (Qundecided, Qeol_type);
6447 if (VECTORP (val2))
6448 val = XVECTOR (val2)->contents[eol_type];
6449 }
6450 return (highest ? val : Fcons (val, Qnil));
6451 }
6452
6453 /* At first, gather possible coding systems in VAL. */
6454 val = Qnil;
6455 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6456 {
6457 Lisp_Object category_val, category_index;
6458
6459 category_index = Fget (XCAR (tmp), Qcoding_category_index);
6460 category_val = Fsymbol_value (XCAR (tmp));
6461 if (!NILP (category_val)
6462 && NATNUMP (category_index)
6463 && (coding_mask & (1 << XFASTINT (category_index))))
6464 {
6465 val = Fcons (category_val, val);
6466 if (highest)
6467 break;
6468 }
6469 }
6470 if (!highest)
6471 val = Fnreverse (val);
6472
6473 /* Then, replace the elements with subsidiary coding systems. */
6474 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6475 {
6476 if (eol_type != CODING_EOL_UNDECIDED
6477 && eol_type != CODING_EOL_INCONSISTENT)
6478 {
6479 Lisp_Object eol;
6480 eol = Fget (XCAR (tmp), Qeol_type);
6481 if (VECTORP (eol))
6482 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6483 }
6484 }
6485 return (highest ? XCAR (val) : val);
6486 }
6487
6488 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6489 2, 3, 0,
6490 doc: /* Detect how the byte sequence in the region is encoded.
6491 Return a list of possible coding systems used on decoding a byte
6492 sequence containing the bytes in the region between START and END when
6493 the coding system `undecided' is specified. The list is ordered by
6494 priority decided in the current language environment.
6495
6496 If only ASCII characters are found, it returns a list of single element
6497 `undecided' or its subsidiary coding system according to a detected
6498 end-of-line format.
6499
6500 If optional argument HIGHEST is non-nil, return the coding system of
6501 highest priority. */)
6502 (start, end, highest)
6503 Lisp_Object start, end, highest;
6504 {
6505 int from, to;
6506 int from_byte, to_byte;
6507 int include_anchor_byte = 0;
6508
6509 CHECK_NUMBER_COERCE_MARKER (start);
6510 CHECK_NUMBER_COERCE_MARKER (end);
6511
6512 validate_region (&start, &end);
6513 from = XINT (start), to = XINT (end);
6514 from_byte = CHAR_TO_BYTE (from);
6515 to_byte = CHAR_TO_BYTE (to);
6516
6517 if (from < GPT && to >= GPT)
6518 move_gap_both (to, to_byte);
6519 /* If we an anchor byte `\0' follows the region, we include it in
6520 the detecting source. Then code detectors can handle the tailing
6521 byte sequence more accurately.
6522
6523 Fix me: This is not a perfect solution. It is better that we
6524 add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6525 */
6526 if (to == Z || (to == GPT && GAP_SIZE > 0))
6527 include_anchor_byte = 1;
6528 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6529 to_byte - from_byte + include_anchor_byte,
6530 !NILP (highest),
6531 !NILP (current_buffer
6532 ->enable_multibyte_characters));
6533 }
6534
6535 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6536 1, 2, 0,
6537 doc: /* Detect how the byte sequence in STRING is encoded.
6538 Return a list of possible coding systems used on decoding a byte
6539 sequence containing the bytes in STRING when the coding system
6540 `undecided' is specified. The list is ordered by priority decided in
6541 the current language environment.
6542
6543 If only ASCII characters are found, it returns a list of single element
6544 `undecided' or its subsidiary coding system according to a detected
6545 end-of-line format.
6546
6547 If optional argument HIGHEST is non-nil, return the coding system of
6548 highest priority. */)
6549 (string, highest)
6550 Lisp_Object string, highest;
6551 {
6552 CHECK_STRING (string);
6553
6554 return detect_coding_system (SDATA (string),
6555 /* "+ 1" is to include the anchor byte
6556 `\0'. With this, code detectors can
6557 handle the tailing bytes more
6558 accurately. */
6559 SBYTES (string) + 1,
6560 !NILP (highest),
6561 STRING_MULTIBYTE (string));
6562 }
6563
/* Subroutine for Ffind_coding_systems_region_internal.

   Return a list of coding systems that safely encode the multibyte
   text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
   possible coding systems.  If it is nil, it means that we have not
   yet found any coding systems.

   The list SAFE_CODINGS is modified in place: entries that cannot
   encode some character of the text are unlinked, and an entry of
   the form ( CODING . SAFE-CHARS ) is expanded to the five-element
   form the first time a character is found missing from its
   SAFE-CHARS.

   WORK_TABLE is a char-table of which element is set to t once the
   element is looked up.

   If a non-ASCII single byte char is found, set
   *single_byte_char_found to 1.  */

static Lisp_Object
find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
     unsigned char *p, *pend;
     Lisp_Object safe_codings, work_table;
     int *single_byte_char_found;
{
  int c, len;
  Lisp_Object val, ch;
  Lisp_Object prev, tail;

  if (NILP (safe_codings))
    goto done_safe_codings;
  while (p < pend)
    {
      c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
      p += len;
      if (ASCII_BYTE_P (c))
        /* We can ignore ASCII characters here.  */
        continue;
      if (SINGLE_BYTE_CHAR_P (c))
        *single_byte_char_found = 1;
      /* Check the safe coding systems for C.  */
      ch = make_number (c);
      val = Faref (work_table, ch);
      if (EQ (val, Qt))
        /* This element was already checked.  Ignore it.  */
        continue;
      /* Remember that we checked this element.  */
      Faset (work_table, ch, Qt);

      /* PREV trails TAIL and is the last entry kept so far; it is
         the cons whose cdr is rewritten when TAIL is dropped.  */
      for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
        {
          /* The three tables below are assigned, and later read,
             only when ENCODABLE is zero.  */
          Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
          int encodable;

          elt = XCAR (tail);
          if (CONSP (XCDR (elt)))
            {
              /* This entry has this format now:
                 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
                          ACCEPT-LATIN-EXTRA ) */
              val = XCDR (elt);
              encodable = ! NILP (Faref (XCAR (val), ch));
              if (! encodable)
                {
                  val = XCDR (val);
                  translation_table = XCAR (val);
                  hash_table = XCAR (XCDR (val));
                  accept_latin_extra = XCAR (XCDR (XCDR (val)));
                }
            }
          else
            {
              /* This entry has this format now: ( CODING . SAFE-CHARS) */
              encodable = ! NILP (Faref (XCDR (elt), ch));
              if (! encodable)
                {
                  /* Transform the format to:
                     ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
                       ACCEPT-LATIN-EXTRA ) */
                  val = Fget (XCAR (elt), Qcoding_system);
                  translation_table
                    = Fplist_get (AREF (val, 3),
                                  Qtranslation_table_for_encode);
                  if (SYMBOLP (translation_table))
                    translation_table = Fget (translation_table,
                                              Qtranslation_table);
                  hash_table
                    = (CHAR_TABLE_P (translation_table)
                       ? XCHAR_TABLE (translation_table)->extras[1]
                       : Qnil);
                  accept_latin_extra
                    = ((EQ (AREF (val, 0), make_number (2))
                        && VECTORP (AREF (val, 4)))
                       ? AREF (AREF (val, 4), 16)
                       : Qnil);
                  XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
                                        translation_table, hash_table,
                                        accept_latin_extra));
                }
            }

          /* A character missing from SAFE-CHARS is still accepted
             if the translation table or the hash table has an entry
             for it, or if it is a single-byte character listed in
             Vlatin_extra_code_table and ACCEPT-LATIN-EXTRA is
             non-nil.  */
          if (! encodable
              && ((CHAR_TABLE_P (translation_table)
                   && ! NILP (Faref (translation_table, ch)))
                  || (HASH_TABLE_P (hash_table)
                      && ! NILP (Fgethash (ch, hash_table, Qnil)))
                  || (SINGLE_BYTE_CHAR_P (c)
                      && ! NILP (accept_latin_extra)
                      && VECTORP (Vlatin_extra_code_table)
                      && ! NILP (AREF (Vlatin_extra_code_table, c)))))
            encodable = 1;
          if (encodable)
            prev = tail;
          else
            {
              /* Exclude this coding system from SAFE_CODINGS.  */
              if (EQ (tail, safe_codings))
                {
                  safe_codings = XCDR (safe_codings);
                  if (NILP (safe_codings))
                    goto done_safe_codings;
                }
              else
                XSETCDR (prev, XCDR (tail));
            }
        }
    }

 done_safe_codings:
  /* If the above loop was terminated before P reaches PEND, it means
     SAFE_CODINGS was set to nil.  If we have not yet found a
     non-ASCII single-byte char, check it now.  */
  if (! *single_byte_char_found)
    while (p < pend)
      {
        c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
        p += len;
        if (! ASCII_BYTE_P (c)
            && SINGLE_BYTE_CHAR_P (c))
          {
            *single_byte_char_found = 1;
            break;
          }
      }
  return safe_codings;
}
6704
6705 DEFUN ("find-coding-systems-region-internal",
6706 Ffind_coding_systems_region_internal,
6707 Sfind_coding_systems_region_internal, 2, 2, 0,
6708 doc: /* Internal use only. */)
6709 (start, end)
6710 Lisp_Object start, end;
6711 {
6712 Lisp_Object work_table, safe_codings;
6713 int non_ascii_p = 0;
6714 int single_byte_char_found = 0;
6715 const unsigned char *p1, *p1end, *p2, *p2end, *p;
6716
6717 if (STRINGP (start))
6718 {
6719 if (!STRING_MULTIBYTE (start))
6720 return Qt;
6721 p1 = SDATA (start), p1end = p1 + SBYTES (start);
6722 p2 = p2end = p1end;
6723 if (SCHARS (start) != SBYTES (start))
6724 non_ascii_p = 1;
6725 }
6726 else
6727 {
6728 int from, to, stop;
6729
6730 CHECK_NUMBER_COERCE_MARKER (start);
6731 CHECK_NUMBER_COERCE_MARKER (end);
6732 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6733 args_out_of_range (start, end);
6734 if (NILP (current_buffer->enable_multibyte_characters))
6735 return Qt;
6736 from = CHAR_TO_BYTE (XINT (start));
6737 to = CHAR_TO_BYTE (XINT (end));
6738 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6739 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6740 if (stop == to)
6741 p2 = p2end = p1end;
6742 else
6743 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6744 if (XINT (end) - XINT (start) != to - from)
6745 non_ascii_p = 1;
6746 }
6747
6748 if (!non_ascii_p)
6749 {
6750 /* We are sure that the text contains no multibyte character.
6751 Check if it contains eight-bit-graphic. */
6752 p = p1;
6753 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6754 if (p == p1end)
6755 {
6756 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6757 if (p == p2end)
6758 return Qt;
6759 }
6760 }
6761
6762 /* The text contains non-ASCII characters. */
6763
6764 work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6765 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6766
6767 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6768 &single_byte_char_found);
6769 if (p2 < p2end)
6770 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6771 &single_byte_char_found);
6772 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6773 safe_codings = Qt;
6774 else
6775 {
6776 /* Turn safe_codings to a list of coding systems... */
6777 Lisp_Object val;
6778
6779 if (single_byte_char_found)
6780 /* ... and append these for eight-bit chars. */
6781 val = Fcons (Qraw_text,
6782 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6783 else
6784 /* ... and append generic coding systems. */
6785 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6786
6787 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6788 val = Fcons (XCAR (XCAR (safe_codings)), val);
6789 safe_codings = val;
6790 }
6791
6792 return safe_codings;
6793 }
6794
6795
6796 /* Search from position POS for such characters that are unencodable
6797 accoding to SAFE_CHARS, and return a list of their positions. P
6798 points where in the memory the character at POS exists. Limit the
6799 search at PEND or when Nth unencodable characters are found.
6800
6801 If SAFE_CHARS is a char table, an element for an unencodable
6802 character is nil.
6803
6804 If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6805
6806 Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6807 eight-bit-graphic characters are unencodable. */
6808
6809 static Lisp_Object
6810 unencodable_char_position (safe_chars, pos, p, pend, n)
6811 Lisp_Object safe_chars;
6812 int pos;
6813 unsigned char *p, *pend;
6814 int n;
6815 {
6816 Lisp_Object pos_list;
6817
6818 pos_list = Qnil;
6819 while (p < pend)
6820 {
6821 int len;
6822 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6823
6824 if (c >= 128
6825 && (CHAR_TABLE_P (safe_chars)
6826 ? NILP (CHAR_TABLE_REF (safe_chars, c))
6827 : (NILP (safe_chars) || c < 256)))
6828 {
6829 pos_list = Fcons (make_number (pos), pos_list);
6830 if (--n <= 0)
6831 break;
6832 }
6833 pos++;
6834 p += len;
6835 }
6836 return Fnreverse (pos_list);
6837 }
6838
6839
6840 DEFUN ("unencodable-char-position", Funencodable_char_position,
6841 Sunencodable_char_position, 3, 5, 0,
6842 doc: /*
6843 Return position of first un-encodable character in a region.
6844 START and END specfiy the region and CODING-SYSTEM specifies the
6845 encoding to check. Return nil if CODING-SYSTEM does encode the region.
6846
6847 If optional 4th argument COUNT is non-nil, it specifies at most how
6848 many un-encodable characters to search. In this case, the value is a
6849 list of positions.
6850
6851 If optional 5th argument STRING is non-nil, it is a string to search
6852 for un-encodable characters. In that case, START and END are indexes
6853 to the string. */)
6854 (start, end, coding_system, count, string)
6855 Lisp_Object start, end, coding_system, count, string;
6856 {
6857 int n;
6858 Lisp_Object safe_chars;
6859 struct coding_system coding;
6860 Lisp_Object positions;
6861 int from, to;
6862 unsigned char *p, *pend;
6863
6864 if (NILP (string))
6865 {
6866 validate_region (&start, &end);
6867 from = XINT (start);
6868 to = XINT (end);
6869 if (NILP (current_buffer->enable_multibyte_characters))
6870 return Qnil;
6871 p = CHAR_POS_ADDR (from);
6872 if (to == GPT)
6873 pend = GPT_ADDR;
6874 else
6875 pend = CHAR_POS_ADDR (to);
6876 }
6877 else
6878 {
6879 CHECK_STRING (string);
6880 CHECK_NATNUM (start);
6881 CHECK_NATNUM (end);
6882 from = XINT (start);
6883 to = XINT (end);
6884 if (from > to
6885 || to > SCHARS (string))
6886 args_out_of_range_3 (string, start, end);
6887 if (! STRING_MULTIBYTE (string))
6888 return Qnil;
6889 p = SDATA (string) + string_char_to_byte (string, from);
6890 pend = SDATA (string) + string_char_to_byte (string, to);
6891 }
6892
6893 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6894
6895 if (NILP (count))
6896 n = 1;
6897 else
6898 {
6899 CHECK_NATNUM (count);
6900 n = XINT (count);
6901 }
6902
6903 if (coding.type == coding_type_no_conversion
6904 || coding.type == coding_type_raw_text)
6905 return Qnil;
6906
6907 if (coding.type == coding_type_undecided)
6908 safe_chars = Qnil;
6909 else
6910 safe_chars = coding_safe_chars (coding_system);
6911
6912 if (STRINGP (string)
6913 || from >= GPT || to <= GPT)
6914 positions = unencodable_char_position (safe_chars, from, p, pend, n);
6915 else
6916 {
6917 Lisp_Object args[2];
6918
6919 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6920 n -= XINT (Flength (args[0]));
6921 if (n <= 0)
6922 positions = args[0];
6923 else
6924 {
6925 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6926 pend, n);
6927 positions = Fappend (2, args);
6928 }
6929 }
6930
6931 return (NILP (count) ? Fcar (positions) : positions);
6932 }
6933
6934
6935 Lisp_Object
6936 code_convert_region1 (start, end, coding_system, encodep)
6937 Lisp_Object start, end, coding_system;
6938 int encodep;
6939 {
6940 struct coding_system coding;
6941 int from, to;
6942
6943 CHECK_NUMBER_COERCE_MARKER (start);
6944 CHECK_NUMBER_COERCE_MARKER (end);
6945 CHECK_SYMBOL (coding_system);
6946
6947 validate_region (&start, &end);
6948 from = XFASTINT (start);
6949 to = XFASTINT (end);
6950
6951 if (NILP (coding_system))
6952 return make_number (to - from);
6953
6954 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6955 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6956
6957 coding.mode |= CODING_MODE_LAST_BLOCK;
6958 coding.src_multibyte = coding.dst_multibyte
6959 = !NILP (current_buffer->enable_multibyte_characters);
6960 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6961 &coding, encodep, 1);
6962 Vlast_coding_system_used = coding.symbol;
6963 return make_number (coding.produced_char);
6964 }
6965
DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
       3, 3, "r\nzCoding system: ",
       doc: /* Decode the current region from the specified coding system.
When called from a program, takes three arguments:
START, END, and CODING-SYSTEM. START and END are buffer positions.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the decoded text. */)
     (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  /* The last argument is ENCODEP; 0 selects decoding.  */
  return code_convert_region1 (start, end, coding_system, 0);
}
6980
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
       3, 3, "r\nzCoding system: ",
       doc: /* Encode the current region into the specified coding system.
When called from a program, takes three arguments:
START, END, and CODING-SYSTEM. START and END are buffer positions.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the encoded text. */)
     (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  /* The last argument is ENCODEP; 1 selects encoding.  */
  return code_convert_region1 (start, end, coding_system, 1);
}
6995
6996 Lisp_Object
6997 code_convert_string1 (string, coding_system, nocopy, encodep)
6998 Lisp_Object string, coding_system, nocopy;
6999 int encodep;
7000 {
7001 struct coding_system coding;
7002
7003 CHECK_STRING (string);
7004 CHECK_SYMBOL (coding_system);
7005
7006 if (NILP (coding_system))
7007 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7008
7009 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7010 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7011
7012 coding.mode |= CODING_MODE_LAST_BLOCK;
7013 string = (encodep
7014 ? encode_coding_string (string, &coding, !NILP (nocopy))
7015 : decode_coding_string (string, &coding, !NILP (nocopy)));
7016 Vlast_coding_system_used = coding.symbol;
7017
7018 return string;
7019 }
7020
7021 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7022 2, 3, 0,
7023 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7024 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7025 if the decoding operation is trivial.
7026 This function sets `last-coding-system-used' to the precise coding system
7027 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7028 not fully specified.) */)
7029 (string, coding_system, nocopy)
7030 Lisp_Object string, coding_system, nocopy;
7031 {
7032 return code_convert_string1 (string, coding_system, nocopy, 0);
7033 }
7034
7035 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7036 2, 3, 0,
7037 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7038 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7039 if the encoding operation is trivial.
7040 This function sets `last-coding-system-used' to the precise coding system
7041 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7042 not fully specified.) */)
7043 (string, coding_system, nocopy)
7044 Lisp_Object string, coding_system, nocopy;
7045 {
7046 return code_convert_string1 (string, coding_system, nocopy, 1);
7047 }
7048
7049 /* Encode or decode STRING according to CODING_SYSTEM.
7050 Do not set Vlast_coding_system_used.
7051
7052 This function is called only from macros DECODE_FILE and
7053 ENCODE_FILE, thus we ignore character composition. */
7054
7055 Lisp_Object
7056 code_convert_string_norecord (string, coding_system, encodep)
7057 Lisp_Object string, coding_system;
7058 int encodep;
7059 {
7060 struct coding_system coding;
7061
7062 CHECK_STRING (string);
7063 CHECK_SYMBOL (coding_system);
7064
7065 if (NILP (coding_system))
7066 return string;
7067
7068 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7069 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7070
7071 coding.composing = COMPOSITION_DISABLED;
7072 coding.mode |= CODING_MODE_LAST_BLOCK;
7073 return (encodep
7074 ? encode_coding_string (string, &coding, 1)
7075 : decode_coding_string (string, &coding, 1));
7076 }
7077 \f
7078 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7079 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7080 Return the corresponding character. */)
7081 (code)
7082 Lisp_Object code;
7083 {
7084 unsigned char c1, c2, s1, s2;
7085 Lisp_Object val;
7086
7087 CHECK_NUMBER (code);
7088 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7089 if (s1 == 0)
7090 {
7091 if (s2 < 0x80)
7092 XSETFASTINT (val, s2);
7093 else if (s2 >= 0xA0 || s2 <= 0xDF)
7094 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7095 else
7096 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7097 }
7098 else
7099 {
7100 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7101 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7102 error ("Invalid Shift JIS code: %x", XFASTINT (code));
7103 DECODE_SJIS (s1, s2, c1, c2);
7104 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7105 }
7106 return val;
7107 }
7108
7109 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7110 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7111 Return the corresponding code in SJIS. */)
7112 (ch)
7113 Lisp_Object ch;
7114 {
7115 int charset, c1, c2, s1, s2;
7116 Lisp_Object val;
7117
7118 CHECK_NUMBER (ch);
7119 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7120 if (charset == CHARSET_ASCII)
7121 {
7122 val = ch;
7123 }
7124 else if (charset == charset_jisx0208
7125 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7126 {
7127 ENCODE_SJIS (c1, c2, s1, s2);
7128 XSETFASTINT (val, (s1 << 8) | s2);
7129 }
7130 else if (charset == charset_katakana_jisx0201
7131 && c1 > 0x20 && c2 < 0xE0)
7132 {
7133 XSETFASTINT (val, c1 | 0x80);
7134 }
7135 else
7136 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7137 return val;
7138 }
7139
7140 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7141 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7142 Return the corresponding character. */)
7143 (code)
7144 Lisp_Object code;
7145 {
7146 int charset;
7147 unsigned char b1, b2, c1, c2;
7148 Lisp_Object val;
7149
7150 CHECK_NUMBER (code);
7151 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7152 if (b1 == 0)
7153 {
7154 if (b2 >= 0x80)
7155 error ("Invalid BIG5 code: %x", XFASTINT (code));
7156 val = code;
7157 }
7158 else
7159 {
7160 if ((b1 < 0xA1 || b1 > 0xFE)
7161 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7162 error ("Invalid BIG5 code: %x", XFASTINT (code));
7163 DECODE_BIG5 (b1, b2, charset, c1, c2);
7164 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7165 }
7166 return val;
7167 }
7168
7169 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7170 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7171 Return the corresponding character code in Big5. */)
7172 (ch)
7173 Lisp_Object ch;
7174 {
7175 int charset, c1, c2, b1, b2;
7176 Lisp_Object val;
7177
7178 CHECK_NUMBER (ch);
7179 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7180 if (charset == CHARSET_ASCII)
7181 {
7182 val = ch;
7183 }
7184 else if ((charset == charset_big5_1
7185 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7186 || (charset == charset_big5_2
7187 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7188 {
7189 ENCODE_BIG5 (charset, c1, c2, b1, b2);
7190 XSETFASTINT (val, (b1 << 8) | b2);
7191 }
7192 else
7193 error ("Can't encode to Big5: %d", XFASTINT (ch));
7194 return val;
7195 }
7196 \f
7197 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7198 Sset_terminal_coding_system_internal, 1, 1, 0,
7199 doc: /* Internal use only. */)
7200 (coding_system)
7201 Lisp_Object coding_system;
7202 {
7203 struct coding_system *terminal_coding = FRAME_TERMINAL_CODING (SELECTED_FRAME ());
7204 CHECK_SYMBOL (coding_system);
7205 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7206 /* We had better not send unsafe characters to terminal. */
7207 terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7208 /* Character composition should be disabled. */
7209 terminal_coding->composing = COMPOSITION_DISABLED;
7210 /* Error notification should be suppressed. */
7211 terminal_coding->suppress_error = 1;
7212 terminal_coding->src_multibyte = 1;
7213 terminal_coding->dst_multibyte = 0;
7214 return Qnil;
7215 }
7216
7217 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7218 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7219 doc: /* Internal use only. */)
7220 (coding_system)
7221 Lisp_Object coding_system;
7222 {
7223 CHECK_SYMBOL (coding_system);
7224 setup_coding_system (Fcheck_coding_system (coding_system),
7225 &safe_terminal_coding);
7226 /* Character composition should be disabled. */
7227 safe_terminal_coding.composing = COMPOSITION_DISABLED;
7228 /* Error notification should be suppressed. */
7229 safe_terminal_coding.suppress_error = 1;
7230 safe_terminal_coding.src_multibyte = 1;
7231 safe_terminal_coding.dst_multibyte = 0;
7232 return Qnil;
7233 }
7234
7235 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7236 Sterminal_coding_system, 0, 0, 0,
7237 doc: /* Return coding system specified for terminal output. */)
7238 ()
7239 {
7240 return FRAME_TERMINAL_CODING (SELECTED_FRAME ())->symbol;
7241 }
7242
7243 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7244 Sset_keyboard_coding_system_internal, 1, 1, 0,
7245 doc: /* Internal use only. */)
7246 (coding_system)
7247 Lisp_Object coding_system;
7248 {
7249 CHECK_SYMBOL (coding_system);
7250 setup_coding_system (Fcheck_coding_system (coding_system),
7251 FRAME_KEYBOARD_CODING (SELECTED_FRAME ()));
7252 /* Character composition should be disabled. */
7253 FRAME_KEYBOARD_CODING (SELECTED_FRAME ())->composing = COMPOSITION_DISABLED;
7254 return Qnil;
7255 }
7256
7257 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7258 Skeyboard_coding_system, 0, 0, 0,
7259 doc: /* Return coding system specified for decoding keyboard input. */)
7260 ()
7261 {
7262 return FRAME_KEYBOARD_CODING (SELECTED_FRAME ())->symbol;
7263 }
7264
7265 \f
7266 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7267 Sfind_operation_coding_system, 1, MANY, 0,
7268 doc: /* Choose a coding system for an operation based on the target name.
7269 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7270 DECODING-SYSTEM is the coding system to use for decoding
7271 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7272 for encoding (in case OPERATION does encoding).
7273
7274 The first argument OPERATION specifies an I/O primitive:
7275 For file I/O, `insert-file-contents' or `write-region'.
7276 For process I/O, `call-process', `call-process-region', or `start-process'.
7277 For network I/O, `open-network-stream'.
7278
7279 The remaining arguments should be the same arguments that were passed
7280 to the primitive. Depending on which primitive, one of those arguments
7281 is selected as the TARGET. For example, if OPERATION does file I/O,
7282 whichever argument specifies the file name is TARGET.
7283
7284 TARGET has a meaning which depends on OPERATION:
7285 For file I/O, TARGET is a file name.
7286 For process I/O, TARGET is a process name.
7287 For network I/O, TARGET is a service name or a port number
7288
7289 This function looks up what specified for TARGET in,
7290 `file-coding-system-alist', `process-coding-system-alist',
7291 or `network-coding-system-alist' depending on OPERATION.
7292 They may specify a coding system, a cons of coding systems,
7293 or a function symbol to call.
7294 In the last case, we call the function with one argument,
7295 which is a list of all the arguments given to this function.
7296
7297 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7298 (nargs, args)
7299 int nargs;
7300 Lisp_Object *args;
7301 {
7302 Lisp_Object operation, target_idx, target, val;
7303 register Lisp_Object chain;
7304
7305 if (nargs < 2)
7306 error ("Too few arguments");
7307 operation = args[0];
7308 if (!SYMBOLP (operation)
7309 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7310 error ("Invalid first argument");
7311 if (nargs < 1 + XINT (target_idx))
7312 error ("Too few arguments for operation: %s",
7313 SDATA (SYMBOL_NAME (operation)));
7314 /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7315 argument to write-region) is string, it must be treated as a
7316 target file name. */
7317 if (EQ (operation, Qwrite_region)
7318 && nargs > 5
7319 && STRINGP (args[5]))
7320 target_idx = make_number (4);
7321 target = args[XINT (target_idx) + 1];
7322 if (!(STRINGP (target)
7323 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7324 error ("Invalid argument %d", XINT (target_idx) + 1);
7325
7326 chain = ((EQ (operation, Qinsert_file_contents)
7327 || EQ (operation, Qwrite_region))
7328 ? Vfile_coding_system_alist
7329 : (EQ (operation, Qopen_network_stream)
7330 ? Vnetwork_coding_system_alist
7331 : Vprocess_coding_system_alist));
7332 if (NILP (chain))
7333 return Qnil;
7334
7335 for (; CONSP (chain); chain = XCDR (chain))
7336 {
7337 Lisp_Object elt;
7338 elt = XCAR (chain);
7339
7340 if (CONSP (elt)
7341 && ((STRINGP (target)
7342 && STRINGP (XCAR (elt))
7343 && fast_string_match (XCAR (elt), target) >= 0)
7344 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7345 {
7346 val = XCDR (elt);
7347 /* Here, if VAL is both a valid coding system and a valid
7348 function symbol, we return VAL as a coding system. */
7349 if (CONSP (val))
7350 return val;
7351 if (! SYMBOLP (val))
7352 return Qnil;
7353 if (! NILP (Fcoding_system_p (val)))
7354 return Fcons (val, val);
7355 if (! NILP (Ffboundp (val)))
7356 {
7357 val = call1 (val, Flist (nargs, args));
7358 if (CONSP (val))
7359 return val;
7360 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7361 return Fcons (val, val);
7362 }
7363 return Qnil;
7364 }
7365 }
7366 return Qnil;
7367 }
7368
7369 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
7370 Supdate_coding_systems_internal, 0, 0, 0,
7371 doc: /* Update internal database for ISO2022 and CCL based coding systems.
7372 When values of any coding categories are changed, you must
7373 call this function. */)
7374 ()
7375 {
7376 int i;
7377
7378 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7379 {
7380 Lisp_Object val;
7381
7382 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7383 if (!NILP (val))
7384 {
7385 if (! coding_system_table[i])
7386 coding_system_table[i] = ((struct coding_system *)
7387 xmalloc (sizeof (struct coding_system)));
7388 setup_coding_system (val, coding_system_table[i]);
7389 }
7390 else if (coding_system_table[i])
7391 {
7392 xfree (coding_system_table[i]);
7393 coding_system_table[i] = NULL;
7394 }
7395 }
7396
7397 return Qnil;
7398 }
7399
7400 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7401 Sset_coding_priority_internal, 0, 0, 0,
7402 doc: /* Update internal database for the current value of `coding-category-list'.
7403 This function is internal use only. */)
7404 ()
7405 {
7406 int i = 0, idx;
7407 Lisp_Object val;
7408
7409 val = Vcoding_category_list;
7410
7411 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7412 {
7413 if (! SYMBOLP (XCAR (val)))
7414 break;
7415 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7416 if (idx >= CODING_CATEGORY_IDX_MAX)
7417 break;
7418 coding_priorities[i++] = (1 << idx);
7419 val = XCDR (val);
7420 }
7421 /* If coding-category-list is valid and contains all coding
7422 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
7423 the following code saves Emacs from crashing. */
7424 while (i < CODING_CATEGORY_IDX_MAX)
7425 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7426
7427 return Qnil;
7428 }
7429
7430 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7431 Sdefine_coding_system_internal, 1, 1, 0,
7432 doc: /* Register CODING-SYSTEM as a base coding system.
7433 This function is internal use only. */)
7434 (coding_system)
7435 Lisp_Object coding_system;
7436 {
7437 Lisp_Object safe_chars, slot;
7438
7439 if (NILP (Fcheck_coding_system (coding_system)))
7440 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7441 safe_chars = coding_safe_chars (coding_system);
7442 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7443 error ("No valid safe-chars property for %s",
7444 SDATA (SYMBOL_NAME (coding_system)));
7445 if (EQ (safe_chars, Qt))
7446 {
7447 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7448 XSETCAR (Vcoding_system_safe_chars,
7449 Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7450 }
7451 else
7452 {
7453 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7454 if (NILP (slot))
7455 XSETCDR (Vcoding_system_safe_chars,
7456 nconc2 (XCDR (Vcoding_system_safe_chars),
7457 Fcons (Fcons (coding_system, safe_chars), Qnil)));
7458 else
7459 XSETCDR (slot, safe_chars);
7460 }
7461 return Qnil;
7462 }
7463
7464 #endif /* emacs */
7465
7466 \f
7467 /*** 9. Post-amble ***/
7468
7469 void
7470 init_coding_once ()
7471 {
7472 int i;
7473
7474 /* Emacs' internal format specific initialize routine. */
7475 for (i = 0; i <= 0x20; i++)
7476 emacs_code_class[i] = EMACS_control_code;
7477 emacs_code_class[0x0A] = EMACS_linefeed_code;
7478 emacs_code_class[0x0D] = EMACS_carriage_return_code;
7479 for (i = 0x21 ; i < 0x7F; i++)
7480 emacs_code_class[i] = EMACS_ascii_code;
7481 emacs_code_class[0x7F] = EMACS_control_code;
7482 for (i = 0x80; i < 0xFF; i++)
7483 emacs_code_class[i] = EMACS_invalid_code;
7484 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7485 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7486 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7487 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7488
7489 /* ISO2022 specific initialize routine. */
7490 for (i = 0; i < 0x20; i++)
7491 iso_code_class[i] = ISO_control_0;
7492 for (i = 0x21; i < 0x7F; i++)
7493 iso_code_class[i] = ISO_graphic_plane_0;
7494 for (i = 0x80; i < 0xA0; i++)
7495 iso_code_class[i] = ISO_control_1;
7496 for (i = 0xA1; i < 0xFF; i++)
7497 iso_code_class[i] = ISO_graphic_plane_1;
7498 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7499 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7500 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7501 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7502 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7503 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7504 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7505 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7506 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7507 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7508
7509 setup_coding_system (Qnil, &safe_terminal_coding);
7510 setup_coding_system (Qnil, &default_buffer_file_coding);
7511
7512 bzero (coding_system_table, sizeof coding_system_table);
7513
7514 bzero (ascii_skip_code, sizeof ascii_skip_code);
7515 for (i = 0; i < 128; i++)
7516 ascii_skip_code[i] = 1;
7517
7518 #if defined (MSDOS) || defined (WINDOWSNT)
7519 system_eol_type = CODING_EOL_CRLF;
7520 #else
7521 system_eol_type = CODING_EOL_LF;
7522 #endif
7523
7524 inhibit_pre_post_conversion = 0;
7525 }
7526
7527 #ifdef emacs
7528
7529 void
7530 syms_of_coding ()
7531 {
7532 Qtarget_idx = intern ("target-idx");
7533 staticpro (&Qtarget_idx);
7534
7535 Qcoding_system_history = intern ("coding-system-history");
7536 staticpro (&Qcoding_system_history);
7537 Fset (Qcoding_system_history, Qnil);
7538
7539 /* Target FILENAME is the first argument. */
7540 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7541 /* Target FILENAME is the third argument. */
7542 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7543
7544 Qcall_process = intern ("call-process");
7545 staticpro (&Qcall_process);
7546 /* Target PROGRAM is the first argument. */
7547 Fput (Qcall_process, Qtarget_idx, make_number (0));
7548
7549 Qcall_process_region = intern ("call-process-region");
7550 staticpro (&Qcall_process_region);
7551 /* Target PROGRAM is the third argument. */
7552 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7553
7554 Qstart_process = intern ("start-process");
7555 staticpro (&Qstart_process);
7556 /* Target PROGRAM is the third argument. */
7557 Fput (Qstart_process, Qtarget_idx, make_number (2));
7558
7559 Qopen_network_stream = intern ("open-network-stream");
7560 staticpro (&Qopen_network_stream);
7561 /* Target SERVICE is the fourth argument. */
7562 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7563
7564 Qcoding_system = intern ("coding-system");
7565 staticpro (&Qcoding_system);
7566
7567 Qeol_type = intern ("eol-type");
7568 staticpro (&Qeol_type);
7569
7570 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7571 staticpro (&Qbuffer_file_coding_system);
7572
7573 Qpost_read_conversion = intern ("post-read-conversion");
7574 staticpro (&Qpost_read_conversion);
7575
7576 Qpre_write_conversion = intern ("pre-write-conversion");
7577 staticpro (&Qpre_write_conversion);
7578
7579 Qno_conversion = intern ("no-conversion");
7580 staticpro (&Qno_conversion);
7581
7582 Qundecided = intern ("undecided");
7583 staticpro (&Qundecided);
7584
7585 Qcoding_system_p = intern ("coding-system-p");
7586 staticpro (&Qcoding_system_p);
7587
7588 Qcoding_system_error = intern ("coding-system-error");
7589 staticpro (&Qcoding_system_error);
7590
7591 Fput (Qcoding_system_error, Qerror_conditions,
7592 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7593 Fput (Qcoding_system_error, Qerror_message,
7594 build_string ("Invalid coding system"));
7595
7596 Qcoding_category = intern ("coding-category");
7597 staticpro (&Qcoding_category);
7598 Qcoding_category_index = intern ("coding-category-index");
7599 staticpro (&Qcoding_category_index);
7600
7601 Vcoding_category_table
7602 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7603 staticpro (&Vcoding_category_table);
7604 {
7605 int i;
7606 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7607 {
7608 XVECTOR (Vcoding_category_table)->contents[i]
7609 = intern (coding_category_name[i]);
7610 Fput (XVECTOR (Vcoding_category_table)->contents[i],
7611 Qcoding_category_index, make_number (i));
7612 }
7613 }
7614
7615 Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7616 staticpro (&Vcoding_system_safe_chars);
7617
7618 Qtranslation_table = intern ("translation-table");
7619 staticpro (&Qtranslation_table);
7620 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7621
7622 Qtranslation_table_id = intern ("translation-table-id");
7623 staticpro (&Qtranslation_table_id);
7624
7625 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7626 staticpro (&Qtranslation_table_for_decode);
7627
7628 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7629 staticpro (&Qtranslation_table_for_encode);
7630
7631 Qsafe_chars = intern ("safe-chars");
7632 staticpro (&Qsafe_chars);
7633
7634 Qchar_coding_system = intern ("char-coding-system");
7635 staticpro (&Qchar_coding_system);
7636
7637 /* Intern this now in case it isn't already done.
7638 Setting this variable twice is harmless.
7639 But don't staticpro it here--that is done in alloc.c. */
7640 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7641 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7642 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7643
7644 Qvalid_codes = intern ("valid-codes");
7645 staticpro (&Qvalid_codes);
7646
7647 Qemacs_mule = intern ("emacs-mule");
7648 staticpro (&Qemacs_mule);
7649
7650 Qraw_text = intern ("raw-text");
7651 staticpro (&Qraw_text);
7652
7653 Qutf_8 = intern ("utf-8");
7654 staticpro (&Qutf_8);
7655
7656 Qcoding_system_define_form = intern ("coding-system-define-form");
7657 staticpro (&Qcoding_system_define_form);
7658
7659 defsubr (&Scoding_system_p);
7660 defsubr (&Sread_coding_system);
7661 defsubr (&Sread_non_nil_coding_system);
7662 defsubr (&Scheck_coding_system);
7663 defsubr (&Sdetect_coding_region);
7664 defsubr (&Sdetect_coding_string);
7665 defsubr (&Sfind_coding_systems_region_internal);
7666 defsubr (&Sunencodable_char_position);
7667 defsubr (&Sdecode_coding_region);
7668 defsubr (&Sencode_coding_region);
7669 defsubr (&Sdecode_coding_string);
7670 defsubr (&Sencode_coding_string);
7671 defsubr (&Sdecode_sjis_char);
7672 defsubr (&Sencode_sjis_char);
7673 defsubr (&Sdecode_big5_char);
7674 defsubr (&Sencode_big5_char);
7675 defsubr (&Sset_terminal_coding_system_internal);
7676 defsubr (&Sset_safe_terminal_coding_system_internal);
7677 defsubr (&Sterminal_coding_system);
7678 defsubr (&Sset_keyboard_coding_system_internal);
7679 defsubr (&Skeyboard_coding_system);
7680 defsubr (&Sfind_operation_coding_system);
7681 defsubr (&Supdate_coding_systems_internal);
7682 defsubr (&Sset_coding_priority_internal);
7683 defsubr (&Sdefine_coding_system_internal);
7684
7685 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7686 doc: /* List of coding systems.
7687
7688 Do not alter the value of this variable manually. This variable should be
7689 updated by the functions `make-coding-system' and
7690 `define-coding-system-alias'. */);
7691 Vcoding_system_list = Qnil;
7692
7693 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7694 doc: /* Alist of coding system names.
7695 Each element is one element list of coding system name.
7696 This variable is given to `completing-read' as TABLE argument.
7697
7698 Do not alter the value of this variable manually. This variable should be
7699 updated by the functions `make-coding-system' and
7700 `define-coding-system-alias'. */);
7701 Vcoding_system_alist = Qnil;
7702
7703 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7704 doc: /* List of coding-categories (symbols) ordered by priority.
7705
7706 On detecting a coding system, Emacs tries code detection algorithms
7707 associated with each coding-category one by one in this order. When
7708 one algorithm agrees with a byte sequence of source text, the coding
7709 system bound to the corresponding coding-category is selected. */);
7710 {
7711 int i;
7712
7713 Vcoding_category_list = Qnil;
7714 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7715 Vcoding_category_list
7716 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7717 Vcoding_category_list);
7718 }
7719
7720 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7721 doc: /* Specify the coding system for read operations.
7722 It is useful to bind this variable with `let', but do not set it globally.
7723 If the value is a coding system, it is used for decoding on read operation.
7724 If not, an appropriate element is used from one of the coding system alists:
7725 There are three such tables, `file-coding-system-alist',
7726 `process-coding-system-alist', and `network-coding-system-alist'. */);
7727 Vcoding_system_for_read = Qnil;
7728
7729 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7730 doc: /* Specify the coding system for write operations.
7731 Programs bind this variable with `let', but you should not set it globally.
7732 If the value is a coding system, it is used for encoding of output,
7733 when writing it to a file and when sending it to a file or subprocess.
7734
7735 If this does not specify a coding system, an appropriate element
7736 is used from one of the coding system alists:
7737 There are three such tables, `file-coding-system-alist',
7738 `process-coding-system-alist', and `network-coding-system-alist'.
7739 For output to files, if the above procedure does not specify a coding system,
7740 the value of `buffer-file-coding-system' is used. */);
7741 Vcoding_system_for_write = Qnil;
7742
7743 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7744 doc: /* Coding system used in the latest file or process I/O.
7745 Also set by `encode-coding-region', `decode-coding-region',
7746 `encode-coding-string' and `decode-coding-string'. */);
7747 Vlast_coding_system_used = Qnil;
7748
7749 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7750 doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7751 See info node `Coding Systems' and info node `Text and Binary' concerning
7752 such conversion. */);
7753 inhibit_eol_conversion = 0;
7754
7755 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7756 doc: /* Non-nil means process buffer inherits coding system of process output.
7757 Bind it to t if the process output is to be treated as if it were a file
7758 read from some filesystem. */);
7759 inherit_process_coding_system = 0;
7760
7761 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7762 doc: /* Alist to decide a coding system to use for a file I/O operation.
7763 The format is ((PATTERN . VAL) ...),
7764 where PATTERN is a regular expression matching a file name,
7765 VAL is a coding system, a cons of coding systems, or a function symbol.
7766 If VAL is a coding system, it is used for both decoding and encoding
7767 the file contents.
7768 If VAL is a cons of coding systems, the car part is used for decoding,
7769 and the cdr part is used for encoding.
7770 If VAL is a function symbol, the function must return a coding system
7771 or a cons of coding systems which are used as above. The function gets
7772 the arguments with which `find-operation-coding-system' was called.
7773
7774 See also the function `find-operation-coding-system'
7775 and the variable `auto-coding-alist'. */);
7776 Vfile_coding_system_alist = Qnil;
7777
7778 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7779 doc: /* Alist to decide a coding system to use for a process I/O operation.
7780 The format is ((PATTERN . VAL) ...),
7781 where PATTERN is a regular expression matching a program name,
7782 VAL is a coding system, a cons of coding systems, or a function symbol.
7783 If VAL is a coding system, it is used for both decoding what received
7784 from the program and encoding what sent to the program.
7785 If VAL is a cons of coding systems, the car part is used for decoding,
7786 and the cdr part is used for encoding.
7787 If VAL is a function symbol, the function must return a coding system
7788 or a cons of coding systems which are used as above.
7789
7790 See also the function `find-operation-coding-system'. */);
7791 Vprocess_coding_system_alist = Qnil;
7792
7793 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7794 doc: /* Alist to decide a coding system to use for a network I/O operation.
7795 The format is ((PATTERN . VAL) ...),
7796 where PATTERN is a regular expression matching a network service name
7797 or is a port number to connect to,
7798 VAL is a coding system, a cons of coding systems, or a function symbol.
7799 If VAL is a coding system, it is used for both decoding what received
7800 from the network stream and encoding what sent to the network stream.
7801 If VAL is a cons of coding systems, the car part is used for decoding,
7802 and the cdr part is used for encoding.
7803 If VAL is a function symbol, the function must return a coding system
7804 or a cons of coding systems which are used as above.
7805
7806 See also the function `find-operation-coding-system'. */);
7807 Vnetwork_coding_system_alist = Qnil;
7808
7809 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7810 doc: /* Coding system to use with system messages.
7811 Also used for decoding keyboard input on X Window system. */);
7812 Vlocale_coding_system = Qnil;
7813
7814 /* The eol mnemonics are reset in startup.el system-dependently. */
7815 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7816 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7817 eol_mnemonic_unix = build_string (":");
7818
7819 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7820 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7821 eol_mnemonic_dos = build_string ("\\");
7822
7823 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7824 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */);
7825 eol_mnemonic_mac = build_string ("/");
7826
7827 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7828 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */);
7829 eol_mnemonic_undecided = build_string (":");
7830
7831 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7832 doc: /* *Non-nil enables character translation while encoding and decoding. */);
7833 Venable_character_translation = Qt;
7834
7835 DEFVAR_LISP ("standard-translation-table-for-decode",
7836 &Vstandard_translation_table_for_decode,
7837 doc: /* Table for translating characters while decoding. */);
7838 Vstandard_translation_table_for_decode = Qnil;
7839
7840 DEFVAR_LISP ("standard-translation-table-for-encode",
7841 &Vstandard_translation_table_for_encode,
7842 doc: /* Table for translating characters while encoding. */);
7843 Vstandard_translation_table_for_encode = Qnil;
7844
7845 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7846 doc: /* Alist of charsets vs revision numbers.
7847 While encoding, if a charset (car part of an element) is found,
7848 designate it with the escape sequence identifying revision (cdr part of the element). */);
7849 Vcharset_revision_alist = Qnil;
7850
7851 DEFVAR_LISP ("default-process-coding-system",
7852 &Vdefault_process_coding_system,
7853 doc: /* Cons of coding systems used for process I/O by default.
7854 The car part is used for decoding a process output,
7855 the cdr part is used for encoding a text to be sent to a process. */);
7856 Vdefault_process_coding_system = Qnil;
7857
7858 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7859 doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7860 This is a vector of length 256.
7861 If Nth element is non-nil, the existence of code N in a file
7862 \(or output of subprocess) doesn't prevent it to be detected as
7863 a coding system of ISO 2022 variant which has a flag
7864 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7865 or reading output of a subprocess.
7866 Only 128th through 159th elements has a meaning. */);
7867 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7868
7869 DEFVAR_LISP ("select-safe-coding-system-function",
7870 &Vselect_safe_coding_system_function,
7871 doc: /* Function to call to select safe coding system for encoding a text.
7872
7873 If set, this function is called to force a user to select a proper
7874 coding system which can encode the text in the case that a default
7875 coding system used in each operation can't encode the text.
7876
7877 The default value is `select-safe-coding-system' (which see). */);
7878 Vselect_safe_coding_system_function = Qnil;
7879
7880 DEFVAR_BOOL ("coding-system-require-warning",
7881 &coding_system_require_warning,
7882 doc: /* Internal use only.
7883 If non-nil, on writing a file, `select-safe-coding-system-function' is
7884 called even if `coding-system-for-write' is non-nil. The command
7885 `universal-coding-system-argument' binds this variable to t temporarily. */);
7886 coding_system_require_warning = 0;
7887
7888
7889 DEFVAR_BOOL ("inhibit-iso-escape-detection",
7890 &inhibit_iso_escape_detection,
7891 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7892
7893 By default, on reading a file, Emacs tries to detect how the text is
7894 encoded. This code detection is sensitive to escape sequences. If
7895 the sequence is valid as ISO2022, the code is determined as one of
7896 the ISO2022 encodings, and the file is decoded by the corresponding
7897 coding system (e.g. `iso-2022-7bit').
7898
7899 However, there may be a case that you want to read escape sequences in
7900 a file as is. In such a case, you can set this variable to non-nil.
7901 Then, as the code detection ignores any escape sequences, no file is
7902 detected as encoded in some ISO2022 encoding. The result is that all
7903 escape sequences become visible in a buffer.
7904
7905 The default value is nil, and it is strongly recommended not to change
7906 it. That is because many Emacs Lisp source files that contain
7907 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7908 in Emacs's distribution, and they won't be decoded correctly on
7909 reading if you suppress escape sequence detection.
7910
7911 The other way to read escape sequences in a file without decoding is
7912 to explicitly specify some coding system that doesn't use ISO2022's
7913 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
7914 inhibit_iso_escape_detection = 0;
7915
7916 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7917 doc: /* Char table for translating self-inserting characters.
7918 This is applied to the result of input methods, not their input. See also
7919 `keyboard-translate-table'. */);
7920 Vtranslation_table_for_input = Qnil;
7921 }
7922
7923 char *
7924 emacs_strerror (error_number)
7925 int error_number;
7926 {
7927 char *str;
7928
7929 synchronize_system_messages_locale ();
7930 str = strerror (error_number);
7931
7932 if (! NILP (Vlocale_coding_system))
7933 {
7934 Lisp_Object dec = code_convert_string_norecord (build_string (str),
7935 Vlocale_coding_system,
7936 0);
7937 str = (char *) SDATA (dec);
7938 }
7939
7940 return str;
7941 }
7942
7943 #endif /* emacs */
7944
7945 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
7946 (do not change this comment) */