]> code.delx.au - gnu-emacs/blob - src/coding.c
merge trunk
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
10
11 This file is part of GNU Emacs.
12
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
17
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return true if the byte sequence conforms to XXX.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
158 {
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
165
166 while (1)
167 {
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
171
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
176 }
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
180
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
185 }
186 #endif
187
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
194
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
199
200 Below is the template of these functions. */
201
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
205 {
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
216
217 while (1)
218 {
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
225 }
226
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
239 }
240 #endif
241
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
248
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
253
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
257
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
262 {
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
270
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
272 {
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
275 }
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
280 }
281 #endif
282
283 \f
284 /*** 1. Preamble ***/
285
286 #include <config.h>
287 #include <stdio.h>
288
289 #ifdef HAVE_WCHAR_H
290 #include <wchar.h>
291 #endif /* HAVE_WCHAR_H */
292
293 #include "lisp.h"
294 #include "character.h"
295 #include "buffer.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
303
304 Lisp_Object Vcoding_system_hash_table;
305
306 static Lisp_Object Qcoding_system, Qeol_type;
307 static Lisp_Object Qcoding_aliases;
308 Lisp_Object Qunix, Qdos;
309 static Lisp_Object Qmac;
310 Lisp_Object Qbuffer_file_coding_system;
311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
312 static Lisp_Object Qdefault_char;
313 Lisp_Object Qno_conversion, Qundecided;
314 Lisp_Object Qcharset, Qutf_8;
315 static Lisp_Object Qiso_2022;
316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
317 static Lisp_Object Qbig, Qlittle;
318 static Lisp_Object Qcoding_system_history;
319 static Lisp_Object Qvalid_codes;
320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
323 static Lisp_Object QCascii_compatible_p;
324
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 static Lisp_Object Qtarget_idx;
328
329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
330
331 /* If a symbol has this property, evaluate the value to define the
332 symbol as a coding system. */
333 static Lisp_Object Qcoding_system_define_form;
334
335 /* Format of end-of-line decided by system. This is Qunix on
336 Unix and Mac, Qdos on DOS/Windows.
337 This has an effect only for external encoding (i.e. for output to
338 file and process), not for in-buffer or Lisp string encoding. */
339 static Lisp_Object system_eol_type;
340
341 #ifdef emacs
342
343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
344
345 /* Coding system emacs-mule and raw-text are for converting only
346 end-of-line format. */
347 Lisp_Object Qemacs_mule, Qraw_text;
348 Lisp_Object Qutf_8_emacs;
349
350 #if defined (WINDOWSNT) || defined (CYGWIN)
351 static Lisp_Object Qutf_16le;
352 #endif
353
354 /* Coding-systems are handed between Emacs Lisp programs and C internal
355 routines by the following three variables. */
356 /* Coding system to be used to encode text for terminal display when
357 terminal coding system is nil. */
358 struct coding_system safe_terminal_coding;
359
360 #endif /* emacs */
361
362 Lisp_Object Qtranslation_table;
363 Lisp_Object Qtranslation_table_id;
364 static Lisp_Object Qtranslation_table_for_decode;
365 static Lisp_Object Qtranslation_table_for_encode;
366
367 /* Two special coding systems. */
368 static Lisp_Object Vsjis_coding_system;
369 static Lisp_Object Vbig5_coding_system;
370
371 /* ISO2022 section */
372
373 #define CODING_ISO_INITIAL(coding, reg) \
374 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
375 coding_attr_iso_initial), \
376 reg)))
377
378
379 #define CODING_ISO_REQUEST(coding, charset_id) \
380 (((charset_id) <= (coding)->max_charset_id \
381 ? ((coding)->safe_charsets[charset_id] != 255 \
382 ? (coding)->safe_charsets[charset_id] \
383 : -1) \
384 : -1))
385
386
387 #define CODING_ISO_FLAGS(coding) \
388 ((coding)->spec.iso_2022.flags)
389 #define CODING_ISO_DESIGNATION(coding, reg) \
390 ((coding)->spec.iso_2022.current_designation[reg])
391 #define CODING_ISO_INVOCATION(coding, plane) \
392 ((coding)->spec.iso_2022.current_invocation[plane])
393 #define CODING_ISO_SINGLE_SHIFTING(coding) \
394 ((coding)->spec.iso_2022.single_shifting)
395 #define CODING_ISO_BOL(coding) \
396 ((coding)->spec.iso_2022.bol)
397 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
398 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
399 #define CODING_ISO_CMP_STATUS(coding) \
400 (&(coding)->spec.iso_2022.cmp_status)
401 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
402 ((coding)->spec.iso_2022.ctext_extended_segment_len)
403 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
404 ((coding)->spec.iso_2022.embedded_utf_8)
405
406 /* Control characters of ISO2022. */
407 /* code */ /* function */
408 #define ISO_CODE_SO 0x0E /* shift-out */
409 #define ISO_CODE_SI 0x0F /* shift-in */
410 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
411 #define ISO_CODE_ESC 0x1B /* escape */
412 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
413 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
414 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
415
416 /* All code (1-byte) of ISO2022 is classified into one of the
417 followings. */
418 enum iso_code_class_type
419 {
420 ISO_control_0, /* Control codes in the range
421 0x00..0x1F and 0x7F, except for the
422 following 5 codes. */
423 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
424 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
425 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
426 ISO_escape, /* ISO_CODE_ESC (0x1B) */
427 ISO_control_1, /* Control codes in the range
428 0x80..0x9F, except for the
429 following 3 codes. */
430 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
431 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
432 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
433 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
434 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
435 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
436 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
437 };
438
439 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
440 `iso-flags' attribute of an iso2022 coding system. */
441
442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
443 instead of the correct short-form sequence (e.g. ESC $ A). */
444 #define CODING_ISO_FLAG_LONG_FORM 0x0001
445
446 /* If set, reset graphic planes and registers at end-of-line to the
447 initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
449
450 /* If set, reset graphic planes and registers before any control
451 characters to the initial state. */
452 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
453
454 /* If set, encode by 7-bit environment. */
455 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
456
457 /* If set, use locking-shift function. */
458 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
459
460 /* If set, use single-shift function. Overwrite
461 CODING_ISO_FLAG_LOCKING_SHIFT. */
462 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
463
464 /* If set, use designation escape sequence. */
465 #define CODING_ISO_FLAG_DESIGNATION 0x0040
466
467 /* If set, produce revision number sequence. */
468 #define CODING_ISO_FLAG_REVISION 0x0080
469
470 /* If set, produce ISO6429's direction specifying sequence. */
471 #define CODING_ISO_FLAG_DIRECTION 0x0100
472
473 /* If set, assume designation states are reset at beginning of line on
474 output. */
475 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
476
477 /* If set, designation sequence should be placed at beginning of line
478 on output. */
479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
480
481 /* If set, do not encode unsafe characters on output. */
482 #define CODING_ISO_FLAG_SAFE 0x0800
483
484 /* If set, extra latin codes (128..159) are accepted as a valid code
485 on input. */
486 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
487
488 #define CODING_ISO_FLAG_COMPOSITION 0x2000
489
490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
491
492 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
493
494 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
495
496 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
497
498 /* A character to be produced on output if encoding of the original
499 character is prohibited by CODING_ISO_FLAG_SAFE. */
500 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
501
502 /* UTF-8 section */
503 #define CODING_UTF_8_BOM(coding) \
504 ((coding)->spec.utf_8_bom)
505
506 /* UTF-16 section */
507 #define CODING_UTF_16_BOM(coding) \
508 ((coding)->spec.utf_16.bom)
509
510 #define CODING_UTF_16_ENDIAN(coding) \
511 ((coding)->spec.utf_16.endian)
512
513 #define CODING_UTF_16_SURROGATE(coding) \
514 ((coding)->spec.utf_16.surrogate)
515
516
517 /* CCL section */
518 #define CODING_CCL_DECODER(coding) \
519 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
520 #define CODING_CCL_ENCODER(coding) \
521 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
522 #define CODING_CCL_VALIDS(coding) \
523 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
524
525 /* Index for each coding category in `coding_categories' */
526
527 enum coding_category
528 {
529 coding_category_iso_7,
530 coding_category_iso_7_tight,
531 coding_category_iso_8_1,
532 coding_category_iso_8_2,
533 coding_category_iso_7_else,
534 coding_category_iso_8_else,
535 coding_category_utf_8_auto,
536 coding_category_utf_8_nosig,
537 coding_category_utf_8_sig,
538 coding_category_utf_16_auto,
539 coding_category_utf_16_be,
540 coding_category_utf_16_le,
541 coding_category_utf_16_be_nosig,
542 coding_category_utf_16_le_nosig,
543 coding_category_charset,
544 coding_category_sjis,
545 coding_category_big5,
546 coding_category_ccl,
547 coding_category_emacs_mule,
548 /* All above are targets of code detection. */
549 coding_category_raw_text,
550 coding_category_undecided,
551 coding_category_max
552 };
553
554 /* Definitions of flag bits used in detect_coding_XXXX. */
555 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
556 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
557 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
558 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
559 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
560 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
561 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
562 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
563 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
564 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
565 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
566 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
567 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
568 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
569 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
570 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
571 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
572 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
573 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
574 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
575
576 /* This value is returned if detect_coding_mask () find nothing other
577 than ASCII characters. */
578 #define CATEGORY_MASK_ANY \
579 (CATEGORY_MASK_ISO_7 \
580 | CATEGORY_MASK_ISO_7_TIGHT \
581 | CATEGORY_MASK_ISO_8_1 \
582 | CATEGORY_MASK_ISO_8_2 \
583 | CATEGORY_MASK_ISO_7_ELSE \
584 | CATEGORY_MASK_ISO_8_ELSE \
585 | CATEGORY_MASK_UTF_8_AUTO \
586 | CATEGORY_MASK_UTF_8_NOSIG \
587 | CATEGORY_MASK_UTF_8_SIG \
588 | CATEGORY_MASK_UTF_16_AUTO \
589 | CATEGORY_MASK_UTF_16_BE \
590 | CATEGORY_MASK_UTF_16_LE \
591 | CATEGORY_MASK_UTF_16_BE_NOSIG \
592 | CATEGORY_MASK_UTF_16_LE_NOSIG \
593 | CATEGORY_MASK_CHARSET \
594 | CATEGORY_MASK_SJIS \
595 | CATEGORY_MASK_BIG5 \
596 | CATEGORY_MASK_CCL \
597 | CATEGORY_MASK_EMACS_MULE)
598
599
600 #define CATEGORY_MASK_ISO_7BIT \
601 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
602
603 #define CATEGORY_MASK_ISO_8BIT \
604 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
605
606 #define CATEGORY_MASK_ISO_ELSE \
607 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
608
609 #define CATEGORY_MASK_ISO_ESCAPE \
610 (CATEGORY_MASK_ISO_7 \
611 | CATEGORY_MASK_ISO_7_TIGHT \
612 | CATEGORY_MASK_ISO_7_ELSE \
613 | CATEGORY_MASK_ISO_8_ELSE)
614
615 #define CATEGORY_MASK_ISO \
616 ( CATEGORY_MASK_ISO_7BIT \
617 | CATEGORY_MASK_ISO_8BIT \
618 | CATEGORY_MASK_ISO_ELSE)
619
620 #define CATEGORY_MASK_UTF_16 \
621 (CATEGORY_MASK_UTF_16_AUTO \
622 | CATEGORY_MASK_UTF_16_BE \
623 | CATEGORY_MASK_UTF_16_LE \
624 | CATEGORY_MASK_UTF_16_BE_NOSIG \
625 | CATEGORY_MASK_UTF_16_LE_NOSIG)
626
627 #define CATEGORY_MASK_UTF_8 \
628 (CATEGORY_MASK_UTF_8_AUTO \
629 | CATEGORY_MASK_UTF_8_NOSIG \
630 | CATEGORY_MASK_UTF_8_SIG)
631
632 /* Table of coding categories (Lisp symbols). This variable is for
633 internal use only. */
634 static Lisp_Object Vcoding_category_table;
635
636 /* Table of coding-categories ordered by priority. */
637 static enum coding_category coding_priorities[coding_category_max];
638
639 /* Nth element is a coding context for the coding system bound to the
640 Nth coding category. */
641 static struct coding_system coding_categories[coding_category_max];
642
643 /*** Commonly used macros and functions ***/
644
645 #ifndef min
646 #define min(a, b) ((a) < (b) ? (a) : (b))
647 #endif
648 #ifndef max
649 #define max(a, b) ((a) > (b) ? (a) : (b))
650 #endif
651
652 #define CODING_GET_INFO(coding, attrs, charset_list) \
653 do { \
654 (attrs) = CODING_ID_ATTRS ((coding)->id); \
655 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
656 } while (0)
657
658 static void
659 CHECK_NATNUM_CAR (Lisp_Object x)
660 {
661 Lisp_Object tmp = XCAR (x);
662 CHECK_NATNUM (tmp);
663 XSETCAR (x, tmp);
664 }
665
666 static void
667 CHECK_NATNUM_CDR (Lisp_Object x)
668 {
669 Lisp_Object tmp = XCDR (x);
670 CHECK_NATNUM (tmp);
671 XSETCDR (x, tmp);
672 }
673
674
675 /* Safely get one byte from the source text pointed by SRC which ends
676 at SRC_END, and set C to that byte. If there are not enough bytes
677 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
678 and a multibyte character is found at SRC, set C to the
679 negative value of the character code. The caller should declare
680 and set these variables appropriately in advance:
681 src, src_end, multibytep */
682
683 #define ONE_MORE_BYTE(c) \
684 do { \
685 if (src == src_end) \
686 { \
687 if (src_base < src) \
688 record_conversion_result \
689 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
690 goto no_more_source; \
691 } \
692 c = *src++; \
693 if (multibytep && (c & 0x80)) \
694 { \
695 if ((c & 0xFE) == 0xC0) \
696 c = ((c & 1) << 6) | *src++; \
697 else \
698 { \
699 src--; \
700 c = - string_char (src, &src, NULL); \
701 record_conversion_result \
702 (coding, CODING_RESULT_INVALID_SRC); \
703 } \
704 } \
705 consumed_chars++; \
706 } while (0)
707
708 /* Safely get two bytes from the source text pointed by SRC which ends
709 at SRC_END, and set C1 and C2 to those bytes while skipping the
710 heading multibyte characters. If there are not enough bytes in the
711 source, it jumps to 'no_more_source'. If MULTIBYTEP and
712 a multibyte character is found for C2, set C2 to the negative value
713 of the character code. The caller should declare and set these
714 variables appropriately in advance:
715 src, src_end, multibytep
716 It is intended that this macro is used in detect_coding_utf_16. */
717
718 #define TWO_MORE_BYTES(c1, c2) \
719 do { \
720 do { \
721 if (src == src_end) \
722 goto no_more_source; \
723 c1 = *src++; \
724 if (multibytep && (c1 & 0x80)) \
725 { \
726 if ((c1 & 0xFE) == 0xC0) \
727 c1 = ((c1 & 1) << 6) | *src++; \
728 else \
729 { \
730 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
731 c1 = -1; \
732 } \
733 } \
734 } while (c1 < 0); \
735 if (src == src_end) \
736 goto no_more_source; \
737 c2 = *src++; \
738 if (multibytep && (c2 & 0x80)) \
739 { \
740 if ((c2 & 0xFE) == 0xC0) \
741 c2 = ((c2 & 1) << 6) | *src++; \
742 else \
743 c2 = -1; \
744 } \
745 } while (0)
746
747
748 /* Store a byte C in the place pointed by DST and increment DST to the
749 next free point, and increment PRODUCED_CHARS. The caller should
750 assure that C is 0..127, and declare and set the variable `dst'
751 appropriately in advance.
752 */
753
754
755 #define EMIT_ONE_ASCII_BYTE(c) \
756 do { \
757 produced_chars++; \
758 *dst++ = (c); \
759 } while (0)
760
761
762 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
763
764 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
765 do { \
766 produced_chars += 2; \
767 *dst++ = (c1), *dst++ = (c2); \
768 } while (0)
769
770
771 /* Store a byte C in the place pointed by DST and increment DST to the
772 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
773 store in an appropriate multibyte form. The caller should
774 declare and set the variables `dst' and `multibytep' appropriately
775 in advance. */
776
777 #define EMIT_ONE_BYTE(c) \
778 do { \
779 produced_chars++; \
780 if (multibytep) \
781 { \
782 unsigned ch = (c); \
783 if (ch >= 0x80) \
784 ch = BYTE8_TO_CHAR (ch); \
785 CHAR_STRING_ADVANCE (ch, dst); \
786 } \
787 else \
788 *dst++ = (c); \
789 } while (0)
790
791
792 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
793
794 #define EMIT_TWO_BYTES(c1, c2) \
795 do { \
796 produced_chars += 2; \
797 if (multibytep) \
798 { \
799 unsigned ch; \
800 \
801 ch = (c1); \
802 if (ch >= 0x80) \
803 ch = BYTE8_TO_CHAR (ch); \
804 CHAR_STRING_ADVANCE (ch, dst); \
805 ch = (c2); \
806 if (ch >= 0x80) \
807 ch = BYTE8_TO_CHAR (ch); \
808 CHAR_STRING_ADVANCE (ch, dst); \
809 } \
810 else \
811 { \
812 *dst++ = (c1); \
813 *dst++ = (c2); \
814 } \
815 } while (0)
816
817
818 #define EMIT_THREE_BYTES(c1, c2, c3) \
819 do { \
820 EMIT_ONE_BYTE (c1); \
821 EMIT_TWO_BYTES (c2, c3); \
822 } while (0)
823
824
825 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
826 do { \
827 EMIT_TWO_BYTES (c1, c2); \
828 EMIT_TWO_BYTES (c3, c4); \
829 } while (0)
830
831
832 static void
833 record_conversion_result (struct coding_system *coding,
834 enum coding_result_code result)
835 {
836 coding->result = result;
837 switch (result)
838 {
839 case CODING_RESULT_INSUFFICIENT_SRC:
840 Vlast_code_conversion_error = Qinsufficient_source;
841 break;
842 case CODING_RESULT_INVALID_SRC:
843 Vlast_code_conversion_error = Qinvalid_source;
844 break;
845 case CODING_RESULT_INTERRUPT:
846 Vlast_code_conversion_error = Qinterrupted;
847 break;
848 case CODING_RESULT_INSUFFICIENT_DST:
849 /* Don't record this error in Vlast_code_conversion_error
850 because it happens just temporarily and is resolved when the
851 whole conversion is finished. */
852 break;
853 case CODING_RESULT_SUCCESS:
854 break;
855 default:
856 Vlast_code_conversion_error = intern ("Unknown error");
857 }
858 }
859
860 /* These wrapper macros are used to preserve validity of pointers into
861 buffer text across calls to decode_char, encode_char, etc, which
862 could cause relocation of buffers if it loads a charset map,
863 because loading a charset map allocates large structures. */
864
865 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
866 do { \
867 ptrdiff_t offset; \
868 \
869 charset_map_loaded = 0; \
870 c = DECODE_CHAR (charset, code); \
871 if (charset_map_loaded \
872 && (offset = coding_change_source (coding))) \
873 { \
874 src += offset; \
875 src_base += offset; \
876 src_end += offset; \
877 } \
878 } while (0)
879
880 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
881 do { \
882 ptrdiff_t offset; \
883 \
884 charset_map_loaded = 0; \
885 code = ENCODE_CHAR (charset, c); \
886 if (charset_map_loaded \
887 && (offset = coding_change_destination (coding))) \
888 { \
889 dst += offset; \
890 dst_end += offset; \
891 } \
892 } while (0)
893
894 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
895 do { \
896 ptrdiff_t offset; \
897 \
898 charset_map_loaded = 0; \
899 charset = char_charset (c, charset_list, code_return); \
900 if (charset_map_loaded \
901 && (offset = coding_change_destination (coding))) \
902 { \
903 dst += offset; \
904 dst_end += offset; \
905 } \
906 } while (0)
907
908 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
909 do { \
910 ptrdiff_t offset; \
911 \
912 charset_map_loaded = 0; \
913 result = CHAR_CHARSET_P (c, charset); \
914 if (charset_map_loaded \
915 && (offset = coding_change_destination (coding))) \
916 { \
917 dst += offset; \
918 dst_end += offset; \
919 } \
920 } while (0)
921
922
923 /* If there are at least BYTES length of room at dst, allocate memory
924 for coding->destination and update dst and dst_end. We don't have
925 to take care of coding->source which will be relocated. It is
926 handled by calling coding_set_source in encode_coding. */
927
928 #define ASSURE_DESTINATION(bytes) \
929 do { \
930 if (dst + (bytes) >= dst_end) \
931 { \
932 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
933 \
934 dst = alloc_destination (coding, more_bytes, dst); \
935 dst_end = coding->destination + coding->dst_bytes; \
936 } \
937 } while (0)
938
939
940 /* Store multibyte form of the character C in P, and advance P to the
941 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
942 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
943 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
944
945 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
946
947 /* Return the character code of character whose multibyte form is at
948 P, and advance P to the end of the multibyte form. This used to be
949 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
950 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
951
952 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
953
954 /* Set coding->source from coding->src_object. */
955
956 static void
957 coding_set_source (struct coding_system *coding)
958 {
959 if (BUFFERP (coding->src_object))
960 {
961 struct buffer *buf = XBUFFER (coding->src_object);
962
963 if (coding->src_pos < 0)
964 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
965 else
966 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
967 }
968 else if (STRINGP (coding->src_object))
969 {
970 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
971 }
972 else
973 {
974 /* Otherwise, the source is C string and is never relocated
975 automatically. Thus we don't have to update anything. */
976 }
977 }
978
979
980 /* Set coding->source from coding->src_object, and return how many
981 bytes coding->source was changed. */
982
983 static ptrdiff_t
984 coding_change_source (struct coding_system *coding)
985 {
986 const unsigned char *orig = coding->source;
987 coding_set_source (coding);
988 return coding->source - orig;
989 }
990
991
992 /* Set coding->destination from coding->dst_object. */
993
994 static void
995 coding_set_destination (struct coding_system *coding)
996 {
997 if (BUFFERP (coding->dst_object))
998 {
999 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1000 {
1001 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1002 coding->dst_bytes = (GAP_END_ADDR
1003 - (coding->src_bytes - coding->consumed)
1004 - coding->destination);
1005 }
1006 else
1007 {
1008 /* We are sure that coding->dst_pos_byte is before the gap
1009 of the buffer. */
1010 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1011 + coding->dst_pos_byte - BEG_BYTE);
1012 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1013 - coding->destination);
1014 }
1015 }
1016 else
1017 {
1018 /* Otherwise, the destination is C string and is never relocated
1019 automatically. Thus we don't have to update anything. */
1020 }
1021 }
1022
1023
1024 /* Set coding->destination from coding->dst_object, and return how
1025 many bytes coding->destination was changed. */
1026
1027 static ptrdiff_t
1028 coding_change_destination (struct coding_system *coding)
1029 {
1030 const unsigned char *orig = coding->destination;
1031 coding_set_destination (coding);
1032 return coding->destination - orig;
1033 }
1034
1035
1036 static void
1037 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1038 {
1039 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1040 string_overflow ();
1041 coding->destination = xrealloc (coding->destination,
1042 coding->dst_bytes + bytes);
1043 coding->dst_bytes += bytes;
1044 }
1045
1046 static void
1047 coding_alloc_by_making_gap (struct coding_system *coding,
1048 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1049 {
1050 if (EQ (coding->src_object, coding->dst_object))
1051 {
1052 /* The gap may contain the produced data at the head and not-yet
1053 consumed data at the tail. To preserve those data, we at
1054 first make the gap size to zero, then increase the gap
1055 size. */
1056 ptrdiff_t add = GAP_SIZE;
1057
1058 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1059 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1060 make_gap (bytes);
1061 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1062 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1063 }
1064 else
1065 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1066 }
1067
1068
1069 static unsigned char *
1070 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1071 unsigned char *dst)
1072 {
1073 ptrdiff_t offset = dst - coding->destination;
1074
1075 if (BUFFERP (coding->dst_object))
1076 {
1077 struct buffer *buf = XBUFFER (coding->dst_object);
1078
1079 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1080 }
1081 else
1082 coding_alloc_by_realloc (coding, nbytes);
1083 coding_set_destination (coding);
1084 dst = coding->destination + offset;
1085 return dst;
1086 }
1087
1088 /** Macros for annotations. */
1089
1090 /* An annotation data is stored in the array coding->charbuf in this
1091 format:
1092 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1093 LENGTH is the number of elements in the annotation.
1094 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1095 NCHARS is the number of characters in the text annotated.
1096
1097 The format of the following elements depend on ANNOTATION_MASK.
1098
1099 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1100 follows:
1101 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1102
1103 NBYTES is the number of bytes specified in the header part of
1104 old-style emacs-mule encoding, or 0 for the other kind of
1105 composition.
1106
1107 METHOD is one of enum composition_method.
1108
1109 Optional COMPOSITION-COMPONENTS are characters and composition
1110 rules.
1111
1112 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1113 follows.
1114
1115 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1116 recover from an invalid annotation, and should be skipped by
1117 produce_annotation. */
1118
1119 /* Maximum length of the header of annotation data. */
1120 #define MAX_ANNOTATION_LENGTH 5
1121
1122 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1123 do { \
1124 *(buf)++ = -(len); \
1125 *(buf)++ = (mask); \
1126 *(buf)++ = (nchars); \
1127 coding->annotated = 1; \
1128 } while (0);
1129
1130 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1131 do { \
1132 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1133 *buf++ = nbytes; \
1134 *buf++ = method; \
1135 } while (0)
1136
1137
1138 #define ADD_CHARSET_DATA(buf, nchars, id) \
1139 do { \
1140 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1141 *buf++ = id; \
1142 } while (0)
1143
1144
1145 /* Bitmasks for coding->eol_seen. */
1146
1147 #define EOL_SEEN_NONE 0
1148 #define EOL_SEEN_LF 1
1149 #define EOL_SEEN_CR 2
1150 #define EOL_SEEN_CRLF 4
1151
1152 \f
1153 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1154
1155
1156
1157 \f
1158 /*** 3. UTF-8 ***/
1159
1160 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1161 Return true if a text is encoded in UTF-8. */
1162
1163 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1164 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1165 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1166 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1167 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1168 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1169
1170 #define UTF_8_BOM_1 0xEF
1171 #define UTF_8_BOM_2 0xBB
1172 #define UTF_8_BOM_3 0xBF
1173
1174 /* Unlike the other detect_coding_XXX, this function counts number of
1175 characters and check EOL format. */
1176
1177 static bool
1178 detect_coding_utf_8 (struct coding_system *coding,
1179 struct coding_detection_info *detect_info)
1180 {
1181 const unsigned char *src = coding->source, *src_base;
1182 const unsigned char *src_end = coding->source + coding->src_bytes;
1183 bool multibytep = coding->src_multibyte;
1184 ptrdiff_t consumed_chars = 0;
1185 bool bom_found = 0;
1186 int nchars = coding->head_ascii;
1187 int eol_seen = coding->eol_seen;
1188
1189 detect_info->checked |= CATEGORY_MASK_UTF_8;
1190 /* A coding system of this category is always ASCII compatible. */
1191 src += nchars;
1192
1193 if (src == coding->source /* BOM should be at the head. */
1194 && src + 3 < src_end /* BOM is 3-byte long. */
1195 && src[0] == UTF_8_BOM_1
1196 && src[1] == UTF_8_BOM_2
1197 && src[2] == UTF_8_BOM_3)
1198 {
1199 bom_found = 1;
1200 src += 3;
1201 nchars++;
1202 }
1203
1204 while (1)
1205 {
1206 int c, c1, c2, c3, c4;
1207
1208 src_base = src;
1209 ONE_MORE_BYTE (c);
1210 if (c < 0 || UTF_8_1_OCTET_P (c))
1211 {
1212 nchars++;
1213 if (c == '\r')
1214 {
1215 if (src < src_end && *src == '\n')
1216 {
1217 eol_seen |= EOL_SEEN_CRLF;
1218 src++;
1219 nchars++;
1220 }
1221 else
1222 eol_seen |= EOL_SEEN_CR;
1223 }
1224 else if (c == '\n')
1225 eol_seen |= EOL_SEEN_LF;
1226 continue;
1227 }
1228 ONE_MORE_BYTE (c1);
1229 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1230 break;
1231 if (UTF_8_2_OCTET_LEADING_P (c))
1232 {
1233 nchars++;
1234 continue;
1235 }
1236 ONE_MORE_BYTE (c2);
1237 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1238 break;
1239 if (UTF_8_3_OCTET_LEADING_P (c))
1240 {
1241 nchars++;
1242 continue;
1243 }
1244 ONE_MORE_BYTE (c3);
1245 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1246 break;
1247 if (UTF_8_4_OCTET_LEADING_P (c))
1248 {
1249 nchars++;
1250 continue;
1251 }
1252 ONE_MORE_BYTE (c4);
1253 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1254 break;
1255 if (UTF_8_5_OCTET_LEADING_P (c))
1256 {
1257 nchars++;
1258 continue;
1259 }
1260 break;
1261 }
1262 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1263 return 0;
1264
1265 no_more_source:
1266 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1267 {
1268 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1269 return 0;
1270 }
1271 if (bom_found)
1272 {
1273 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1274 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1275 }
1276 else
1277 {
1278 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1279 if (nchars < src_end - coding->source)
1280 /* The found characters are less than source bytes, which
1281 means that we found a valid non-ASCII characters. */
1282 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1283 }
1284 coding->detected_utf8_chars = nchars;
1285 return 1;
1286 }
1287
1288
1289 static void
1290 decode_coding_utf_8 (struct coding_system *coding)
1291 {
1292 const unsigned char *src = coding->source + coding->consumed;
1293 const unsigned char *src_end = coding->source + coding->src_bytes;
1294 const unsigned char *src_base;
1295 int *charbuf = coding->charbuf + coding->charbuf_used;
1296 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1297 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1298 bool multibytep = coding->src_multibyte;
1299 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1300 bool eol_dos
1301 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1302 int byte_after_cr = -1;
1303
1304 if (bom != utf_without_bom)
1305 {
1306 int c1, c2, c3;
1307
1308 src_base = src;
1309 ONE_MORE_BYTE (c1);
1310 if (! UTF_8_3_OCTET_LEADING_P (c1))
1311 src = src_base;
1312 else
1313 {
1314 ONE_MORE_BYTE (c2);
1315 if (! UTF_8_EXTRA_OCTET_P (c2))
1316 src = src_base;
1317 else
1318 {
1319 ONE_MORE_BYTE (c3);
1320 if (! UTF_8_EXTRA_OCTET_P (c3))
1321 src = src_base;
1322 else
1323 {
1324 if ((c1 != UTF_8_BOM_1)
1325 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1326 src = src_base;
1327 else
1328 CODING_UTF_8_BOM (coding) = utf_without_bom;
1329 }
1330 }
1331 }
1332 }
1333 CODING_UTF_8_BOM (coding) = utf_without_bom;
1334
1335 while (1)
1336 {
1337 int c, c1, c2, c3, c4, c5;
1338
1339 src_base = src;
1340 consumed_chars_base = consumed_chars;
1341
1342 if (charbuf >= charbuf_end)
1343 {
1344 if (byte_after_cr >= 0)
1345 src_base--;
1346 break;
1347 }
1348
1349 if (byte_after_cr >= 0)
1350 c1 = byte_after_cr, byte_after_cr = -1;
1351 else
1352 ONE_MORE_BYTE (c1);
1353 if (c1 < 0)
1354 {
1355 c = - c1;
1356 }
1357 else if (UTF_8_1_OCTET_P (c1))
1358 {
1359 if (eol_dos && c1 == '\r')
1360 ONE_MORE_BYTE (byte_after_cr);
1361 c = c1;
1362 }
1363 else
1364 {
1365 ONE_MORE_BYTE (c2);
1366 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1367 goto invalid_code;
1368 if (UTF_8_2_OCTET_LEADING_P (c1))
1369 {
1370 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1371 /* Reject overlong sequences here and below. Encoders
1372 producing them are incorrect, they can be misleading,
1373 and they mess up read/write invariance. */
1374 if (c < 128)
1375 goto invalid_code;
1376 }
1377 else
1378 {
1379 ONE_MORE_BYTE (c3);
1380 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1381 goto invalid_code;
1382 if (UTF_8_3_OCTET_LEADING_P (c1))
1383 {
1384 c = (((c1 & 0xF) << 12)
1385 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1386 if (c < 0x800
1387 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1388 goto invalid_code;
1389 }
1390 else
1391 {
1392 ONE_MORE_BYTE (c4);
1393 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1394 goto invalid_code;
1395 if (UTF_8_4_OCTET_LEADING_P (c1))
1396 {
1397 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1398 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1399 if (c < 0x10000)
1400 goto invalid_code;
1401 }
1402 else
1403 {
1404 ONE_MORE_BYTE (c5);
1405 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1406 goto invalid_code;
1407 if (UTF_8_5_OCTET_LEADING_P (c1))
1408 {
1409 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1410 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1411 | (c5 & 0x3F));
1412 if ((c > MAX_CHAR) || (c < 0x200000))
1413 goto invalid_code;
1414 }
1415 else
1416 goto invalid_code;
1417 }
1418 }
1419 }
1420 }
1421
1422 *charbuf++ = c;
1423 continue;
1424
1425 invalid_code:
1426 src = src_base;
1427 consumed_chars = consumed_chars_base;
1428 ONE_MORE_BYTE (c);
1429 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1430 coding->errors++;
1431 }
1432
1433 no_more_source:
1434 coding->consumed_char += consumed_chars_base;
1435 coding->consumed = src_base - coding->source;
1436 coding->charbuf_used = charbuf - coding->charbuf;
1437 }
1438
1439
1440 static bool
1441 encode_coding_utf_8 (struct coding_system *coding)
1442 {
1443 bool multibytep = coding->dst_multibyte;
1444 int *charbuf = coding->charbuf;
1445 int *charbuf_end = charbuf + coding->charbuf_used;
1446 unsigned char *dst = coding->destination + coding->produced;
1447 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1448 ptrdiff_t produced_chars = 0;
1449 int c;
1450
1451 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1452 {
1453 ASSURE_DESTINATION (3);
1454 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1455 CODING_UTF_8_BOM (coding) = utf_without_bom;
1456 }
1457
1458 if (multibytep)
1459 {
1460 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1461
1462 while (charbuf < charbuf_end)
1463 {
1464 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1465
1466 ASSURE_DESTINATION (safe_room);
1467 c = *charbuf++;
1468 if (CHAR_BYTE8_P (c))
1469 {
1470 c = CHAR_TO_BYTE8 (c);
1471 EMIT_ONE_BYTE (c);
1472 }
1473 else
1474 {
1475 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1476 for (p = str; p < pend; p++)
1477 EMIT_ONE_BYTE (*p);
1478 }
1479 }
1480 }
1481 else
1482 {
1483 int safe_room = MAX_MULTIBYTE_LENGTH;
1484
1485 while (charbuf < charbuf_end)
1486 {
1487 ASSURE_DESTINATION (safe_room);
1488 c = *charbuf++;
1489 if (CHAR_BYTE8_P (c))
1490 *dst++ = CHAR_TO_BYTE8 (c);
1491 else
1492 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1493 produced_chars++;
1494 }
1495 }
1496 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1497 coding->produced_char += produced_chars;
1498 coding->produced = dst - coding->destination;
1499 return 0;
1500 }
1501
1502
1503 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1504 Return true if a text is encoded in one of UTF-16 based coding systems. */
1505
1506 #define UTF_16_HIGH_SURROGATE_P(val) \
1507 (((val) & 0xFC00) == 0xD800)
1508
1509 #define UTF_16_LOW_SURROGATE_P(val) \
1510 (((val) & 0xFC00) == 0xDC00)
1511
1512
1513 static bool
1514 detect_coding_utf_16 (struct coding_system *coding,
1515 struct coding_detection_info *detect_info)
1516 {
1517 const unsigned char *src = coding->source;
1518 const unsigned char *src_end = coding->source + coding->src_bytes;
1519 bool multibytep = coding->src_multibyte;
1520 int c1, c2;
1521
1522 detect_info->checked |= CATEGORY_MASK_UTF_16;
1523 if (coding->mode & CODING_MODE_LAST_BLOCK
1524 && (coding->src_chars & 1))
1525 {
1526 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1527 return 0;
1528 }
1529
1530 TWO_MORE_BYTES (c1, c2);
1531 if ((c1 == 0xFF) && (c2 == 0xFE))
1532 {
1533 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1534 | CATEGORY_MASK_UTF_16_AUTO);
1535 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1536 | CATEGORY_MASK_UTF_16_BE_NOSIG
1537 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1538 }
1539 else if ((c1 == 0xFE) && (c2 == 0xFF))
1540 {
1541 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1542 | CATEGORY_MASK_UTF_16_AUTO);
1543 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1544 | CATEGORY_MASK_UTF_16_BE_NOSIG
1545 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1546 }
1547 else if (c2 < 0)
1548 {
1549 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1550 return 0;
1551 }
1552 else
1553 {
1554 /* We check the dispersion of Eth and Oth bytes where E is even and
1555 O is odd. If both are high, we assume binary data.*/
1556 unsigned char e[256], o[256];
1557 unsigned e_num = 1, o_num = 1;
1558
1559 memset (e, 0, 256);
1560 memset (o, 0, 256);
1561 e[c1] = 1;
1562 o[c2] = 1;
1563
1564 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1565 |CATEGORY_MASK_UTF_16_BE
1566 | CATEGORY_MASK_UTF_16_LE);
1567
1568 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1569 != CATEGORY_MASK_UTF_16)
1570 {
1571 TWO_MORE_BYTES (c1, c2);
1572 if (c2 < 0)
1573 break;
1574 if (! e[c1])
1575 {
1576 e[c1] = 1;
1577 e_num++;
1578 if (e_num >= 128)
1579 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1580 }
1581 if (! o[c2])
1582 {
1583 o[c2] = 1;
1584 o_num++;
1585 if (o_num >= 128)
1586 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1587 }
1588 }
1589 return 0;
1590 }
1591
1592 no_more_source:
1593 return 1;
1594 }
1595
1596 static void
1597 decode_coding_utf_16 (struct coding_system *coding)
1598 {
1599 const unsigned char *src = coding->source + coding->consumed;
1600 const unsigned char *src_end = coding->source + coding->src_bytes;
1601 const unsigned char *src_base;
1602 int *charbuf = coding->charbuf + coding->charbuf_used;
1603 /* We may produces at most 3 chars in one loop. */
1604 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1605 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1606 bool multibytep = coding->src_multibyte;
1607 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1608 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1609 int surrogate = CODING_UTF_16_SURROGATE (coding);
1610 bool eol_dos
1611 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1612 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1613
1614 if (bom == utf_with_bom)
1615 {
1616 int c, c1, c2;
1617
1618 src_base = src;
1619 ONE_MORE_BYTE (c1);
1620 ONE_MORE_BYTE (c2);
1621 c = (c1 << 8) | c2;
1622
1623 if (endian == utf_16_big_endian
1624 ? c != 0xFEFF : c != 0xFFFE)
1625 {
1626 /* The first two bytes are not BOM. Treat them as bytes
1627 for a normal character. */
1628 src = src_base;
1629 coding->errors++;
1630 }
1631 CODING_UTF_16_BOM (coding) = utf_without_bom;
1632 }
1633 else if (bom == utf_detect_bom)
1634 {
1635 /* We have already tried to detect BOM and failed in
1636 detect_coding. */
1637 CODING_UTF_16_BOM (coding) = utf_without_bom;
1638 }
1639
1640 while (1)
1641 {
1642 int c, c1, c2;
1643
1644 src_base = src;
1645 consumed_chars_base = consumed_chars;
1646
1647 if (charbuf >= charbuf_end)
1648 {
1649 if (byte_after_cr1 >= 0)
1650 src_base -= 2;
1651 break;
1652 }
1653
1654 if (byte_after_cr1 >= 0)
1655 c1 = byte_after_cr1, byte_after_cr1 = -1;
1656 else
1657 ONE_MORE_BYTE (c1);
1658 if (c1 < 0)
1659 {
1660 *charbuf++ = -c1;
1661 continue;
1662 }
1663 if (byte_after_cr2 >= 0)
1664 c2 = byte_after_cr2, byte_after_cr2 = -1;
1665 else
1666 ONE_MORE_BYTE (c2);
1667 if (c2 < 0)
1668 {
1669 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1670 *charbuf++ = -c2;
1671 continue;
1672 }
1673 c = (endian == utf_16_big_endian
1674 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1675
1676 if (surrogate)
1677 {
1678 if (! UTF_16_LOW_SURROGATE_P (c))
1679 {
1680 if (endian == utf_16_big_endian)
1681 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1682 else
1683 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1684 *charbuf++ = c1;
1685 *charbuf++ = c2;
1686 coding->errors++;
1687 if (UTF_16_HIGH_SURROGATE_P (c))
1688 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1689 else
1690 *charbuf++ = c;
1691 }
1692 else
1693 {
1694 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1695 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1696 *charbuf++ = 0x10000 + c;
1697 }
1698 }
1699 else
1700 {
1701 if (UTF_16_HIGH_SURROGATE_P (c))
1702 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1703 else
1704 {
1705 if (eol_dos && c == '\r')
1706 {
1707 ONE_MORE_BYTE (byte_after_cr1);
1708 ONE_MORE_BYTE (byte_after_cr2);
1709 }
1710 *charbuf++ = c;
1711 }
1712 }
1713 }
1714
1715 no_more_source:
1716 coding->consumed_char += consumed_chars_base;
1717 coding->consumed = src_base - coding->source;
1718 coding->charbuf_used = charbuf - coding->charbuf;
1719 }
1720
1721 static bool
1722 encode_coding_utf_16 (struct coding_system *coding)
1723 {
1724 bool multibytep = coding->dst_multibyte;
1725 int *charbuf = coding->charbuf;
1726 int *charbuf_end = charbuf + coding->charbuf_used;
1727 unsigned char *dst = coding->destination + coding->produced;
1728 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1729 int safe_room = 8;
1730 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1731 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1732 ptrdiff_t produced_chars = 0;
1733 int c;
1734
1735 if (bom != utf_without_bom)
1736 {
1737 ASSURE_DESTINATION (safe_room);
1738 if (big_endian)
1739 EMIT_TWO_BYTES (0xFE, 0xFF);
1740 else
1741 EMIT_TWO_BYTES (0xFF, 0xFE);
1742 CODING_UTF_16_BOM (coding) = utf_without_bom;
1743 }
1744
1745 while (charbuf < charbuf_end)
1746 {
1747 ASSURE_DESTINATION (safe_room);
1748 c = *charbuf++;
1749 if (c > MAX_UNICODE_CHAR)
1750 c = coding->default_char;
1751
1752 if (c < 0x10000)
1753 {
1754 if (big_endian)
1755 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1756 else
1757 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1758 }
1759 else
1760 {
1761 int c1, c2;
1762
1763 c -= 0x10000;
1764 c1 = (c >> 10) + 0xD800;
1765 c2 = (c & 0x3FF) + 0xDC00;
1766 if (big_endian)
1767 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1768 else
1769 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1770 }
1771 }
1772 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1773 coding->produced = dst - coding->destination;
1774 coding->produced_char += produced_chars;
1775 return 0;
1776 }
1777
1778 \f
1779 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1780
1781 /* Emacs' internal format for representation of multiple character
1782 sets is a kind of multi-byte encoding, i.e. characters are
1783 represented by variable-length sequences of one-byte codes.
1784
1785 ASCII characters and control characters (e.g. `tab', `newline') are
1786 represented by one-byte sequences which are their ASCII codes, in
1787 the range 0x00 through 0x7F.
1788
1789 8-bit characters of the range 0x80..0x9F are represented by
1790 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1791 code + 0x20).
1792
1793 8-bit characters of the range 0xA0..0xFF are represented by
1794 one-byte sequences which are their 8-bit code.
1795
1796 The other characters are represented by a sequence of `base
1797 leading-code', optional `extended leading-code', and one or two
1798 `position-code's. The length of the sequence is determined by the
1799 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1800 whereas extended leading-code and position-code take the range 0xA0
1801 through 0xFF. See `charset.h' for more details about leading-code
1802 and position-code.
1803
1804 --- CODE RANGE of Emacs' internal format ---
1805 character set range
1806 ------------- -----
1807 ascii 0x00..0x7F
1808 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1809 eight-bit-graphic 0xA0..0xBF
1810 ELSE 0x81..0x9D + [0xA0..0xFF]+
1811 ---------------------------------------------
1812
1813 As this is the internal character representation, the format is
1814 usually not used externally (i.e. in a file or in a data sent to a
1815 process). But, it is possible to have a text externally in this
1816 format (i.e. by encoding by the coding system `emacs-mule').
1817
1818 In that case, a sequence of one-byte codes has a slightly different
1819 form.
1820
1821 At first, all characters in eight-bit-control are represented by
1822 one-byte sequences which are their 8-bit code.
1823
1824 Next, character composition data are represented by the byte
1825 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1826 where,
1827 METHOD is 0xF2 plus one of composition method (enum
1828 composition_method),
1829
1830 BYTES is 0xA0 plus a byte length of this composition data,
1831
1832 CHARS is 0xA0 plus a number of characters composed by this
1833 data,
1834
1835 COMPONENTs are characters of multibyte form or composition
1836 rules encoded by two-byte of ASCII codes.
1837
1838 In addition, for backward compatibility, the following formats are
1839 also recognized as composition data on decoding.
1840
1841 0x80 MSEQ ...
1842 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1843
1844 Here,
1845 MSEQ is a multibyte form but in these special format:
1846 ASCII: 0xA0 ASCII_CODE+0x80,
1847 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1848 RULE is a one byte code of the range 0xA0..0xF0 that
1849 represents a composition rule.
1850 */
1851
1852 char emacs_mule_bytes[256];
1853
1854
1855 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1856 Return true if a text is encoded in 'emacs-mule'. */
1857
1858 static bool
1859 detect_coding_emacs_mule (struct coding_system *coding,
1860 struct coding_detection_info *detect_info)
1861 {
1862 const unsigned char *src = coding->source, *src_base;
1863 const unsigned char *src_end = coding->source + coding->src_bytes;
1864 bool multibytep = coding->src_multibyte;
1865 ptrdiff_t consumed_chars = 0;
1866 int c;
1867 int found = 0;
1868
1869 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1870 /* A coding system of this category is always ASCII compatible. */
1871 src += coding->head_ascii;
1872
1873 while (1)
1874 {
1875 src_base = src;
1876 ONE_MORE_BYTE (c);
1877 if (c < 0)
1878 continue;
1879 if (c == 0x80)
1880 {
1881 /* Perhaps the start of composite character. We simply skip
1882 it because analyzing it is too heavy for detecting. But,
1883 at least, we check that the composite character
1884 constitutes of more than 4 bytes. */
1885 const unsigned char *src_start;
1886
1887 repeat:
1888 src_start = src;
1889 do
1890 {
1891 ONE_MORE_BYTE (c);
1892 }
1893 while (c >= 0xA0);
1894
1895 if (src - src_start <= 4)
1896 break;
1897 found = CATEGORY_MASK_EMACS_MULE;
1898 if (c == 0x80)
1899 goto repeat;
1900 }
1901
1902 if (c < 0x80)
1903 {
1904 if (c < 0x20
1905 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1906 break;
1907 }
1908 else
1909 {
1910 int more_bytes = emacs_mule_bytes[c] - 1;
1911
1912 while (more_bytes > 0)
1913 {
1914 ONE_MORE_BYTE (c);
1915 if (c < 0xA0)
1916 {
1917 src--; /* Unread the last byte. */
1918 break;
1919 }
1920 more_bytes--;
1921 }
1922 if (more_bytes != 0)
1923 break;
1924 found = CATEGORY_MASK_EMACS_MULE;
1925 }
1926 }
1927 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1928 return 0;
1929
1930 no_more_source:
1931 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1932 {
1933 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1934 return 0;
1935 }
1936 detect_info->found |= found;
1937 return 1;
1938 }
1939
1940
1941 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1942 character. If CMP_STATUS indicates that we must expect MSEQ or
1943 RULE described above, decode it and return the negative value of
1944 the decoded character or rule. If an invalid byte is found, return
1945 -1. If SRC is too short, return -2. */
1946
1947 static int
1948 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1949 int *nbytes, int *nchars, int *id,
1950 struct composition_status *cmp_status)
1951 {
1952 const unsigned char *src_end = coding->source + coding->src_bytes;
1953 const unsigned char *src_base = src;
1954 bool multibytep = coding->src_multibyte;
1955 int charset_ID;
1956 unsigned code;
1957 int c;
1958 int consumed_chars = 0;
1959 bool mseq_found = 0;
1960
1961 ONE_MORE_BYTE (c);
1962 if (c < 0)
1963 {
1964 c = -c;
1965 charset_ID = emacs_mule_charset[0];
1966 }
1967 else
1968 {
1969 if (c >= 0xA0)
1970 {
1971 if (cmp_status->state != COMPOSING_NO
1972 && cmp_status->old_form)
1973 {
1974 if (cmp_status->state == COMPOSING_CHAR)
1975 {
1976 if (c == 0xA0)
1977 {
1978 ONE_MORE_BYTE (c);
1979 c -= 0x80;
1980 if (c < 0)
1981 goto invalid_code;
1982 }
1983 else
1984 c -= 0x20;
1985 mseq_found = 1;
1986 }
1987 else
1988 {
1989 *nbytes = src - src_base;
1990 *nchars = consumed_chars;
1991 return -c;
1992 }
1993 }
1994 else
1995 goto invalid_code;
1996 }
1997
1998 switch (emacs_mule_bytes[c])
1999 {
2000 case 2:
2001 if ((charset_ID = emacs_mule_charset[c]) < 0)
2002 goto invalid_code;
2003 ONE_MORE_BYTE (c);
2004 if (c < 0xA0)
2005 goto invalid_code;
2006 code = c & 0x7F;
2007 break;
2008
2009 case 3:
2010 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2011 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2012 {
2013 ONE_MORE_BYTE (c);
2014 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2015 goto invalid_code;
2016 ONE_MORE_BYTE (c);
2017 if (c < 0xA0)
2018 goto invalid_code;
2019 code = c & 0x7F;
2020 }
2021 else
2022 {
2023 if ((charset_ID = emacs_mule_charset[c]) < 0)
2024 goto invalid_code;
2025 ONE_MORE_BYTE (c);
2026 if (c < 0xA0)
2027 goto invalid_code;
2028 code = (c & 0x7F) << 8;
2029 ONE_MORE_BYTE (c);
2030 if (c < 0xA0)
2031 goto invalid_code;
2032 code |= c & 0x7F;
2033 }
2034 break;
2035
2036 case 4:
2037 ONE_MORE_BYTE (c);
2038 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2039 goto invalid_code;
2040 ONE_MORE_BYTE (c);
2041 if (c < 0xA0)
2042 goto invalid_code;
2043 code = (c & 0x7F) << 8;
2044 ONE_MORE_BYTE (c);
2045 if (c < 0xA0)
2046 goto invalid_code;
2047 code |= c & 0x7F;
2048 break;
2049
2050 case 1:
2051 code = c;
2052 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2053 break;
2054
2055 default:
2056 emacs_abort ();
2057 }
2058 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2059 CHARSET_FROM_ID (charset_ID), code, c);
2060 if (c < 0)
2061 goto invalid_code;
2062 }
2063 *nbytes = src - src_base;
2064 *nchars = consumed_chars;
2065 if (id)
2066 *id = charset_ID;
2067 return (mseq_found ? -c : c);
2068
2069 no_more_source:
2070 return -2;
2071
2072 invalid_code:
2073 return -1;
2074 }
2075
2076
2077 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2078
2079 /* Handle these composition sequence ('|': the end of header elements,
2080 BYTES and CHARS >= 0xA0):
2081
2082 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2083 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2084 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2085
2086 and these old form:
2087
2088 (4) relative composition: 0x80 | MSEQ ... MSEQ
2089 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2090
2091 When the starter 0x80 and the following header elements are found,
2092 this annotation header is produced.
2093
2094 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2095
2096 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2097 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2098
2099 Then, upon reading the following elements, these codes are produced
2100 until the composition end is found:
2101
2102 (1) CHAR ... CHAR
2103 (2) ALT ... ALT CHAR ... CHAR
2104 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2105 (4) CHAR ... CHAR
2106 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2107
2108 When the composition end is found, LENGTH and NCHARS in the
2109 annotation header is updated as below:
2110
2111 (1) LENGTH: unchanged, NCHARS: unchanged
2112 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2113 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2114 (4) LENGTH: unchanged, NCHARS: number of CHARs
2115 (5) LENGTH: unchanged, NCHARS: number of CHARs
2116
2117 If an error is found while composing, the annotation header is
2118 changed to the original composition header (plus filler -1s) as
2119 below:
2120
2121 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2122 (5) [ 0x80 0xFF -1 -1- -1 ]
2123
2124 and the sequence [ -2 DECODED-RULE ] is changed to the original
2125 byte sequence as below:
2126 o the original byte sequence is B: [ B -1 ]
2127 o the original byte sequence is B1 B2: [ B1 B2 ]
2128
2129 Most of the routines are implemented by macros because many
2130 variables and labels in the caller decode_coding_emacs_mule must be
2131 accessible, and they are usually called just once (thus doesn't
2132 increase the size of compiled object). */
2133
2134 /* Decode a composition rule represented by C as a component of
2135 composition sequence of Emacs 20 style. Set RULE to the decoded
2136 rule. */
2137
2138 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2139 do { \
2140 int gref, nref; \
2141 \
2142 c -= 0xA0; \
2143 if (c < 0 || c >= 81) \
2144 goto invalid_code; \
2145 gref = c / 9, nref = c % 9; \
2146 if (gref == 4) gref = 10; \
2147 if (nref == 4) nref = 10; \
2148 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2149 } while (0)
2150
2151
2152 /* Decode a composition rule represented by C and the following byte
2153 at SRC as a component of composition sequence of Emacs 21 style.
2154 Set RULE to the decoded rule. */
2155
2156 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2157 do { \
2158 int gref, nref; \
2159 \
2160 gref = c - 0x20; \
2161 if (gref < 0 || gref >= 81) \
2162 goto invalid_code; \
2163 ONE_MORE_BYTE (c); \
2164 nref = c - 0x20; \
2165 if (nref < 0 || nref >= 81) \
2166 goto invalid_code; \
2167 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2168 } while (0)
2169
2170
2171 /* Start of Emacs 21 style format. The first three bytes at SRC are
2172 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2173 byte length of this composition information, CHARS is the number of
2174 characters composed by this composition. */
2175
2176 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2177 do { \
2178 enum composition_method method = c - 0xF2; \
2179 int nbytes, nchars; \
2180 \
2181 ONE_MORE_BYTE (c); \
2182 if (c < 0) \
2183 goto invalid_code; \
2184 nbytes = c - 0xA0; \
2185 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2186 goto invalid_code; \
2187 ONE_MORE_BYTE (c); \
2188 nchars = c - 0xA0; \
2189 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2190 goto invalid_code; \
2191 cmp_status->old_form = 0; \
2192 cmp_status->method = method; \
2193 if (method == COMPOSITION_RELATIVE) \
2194 cmp_status->state = COMPOSING_CHAR; \
2195 else \
2196 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2197 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2198 cmp_status->nchars = nchars; \
2199 cmp_status->ncomps = nbytes - 4; \
2200 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2201 } while (0)
2202
2203
2204 /* Start of Emacs 20 style format for relative composition. */
2205
2206 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2207 do { \
2208 cmp_status->old_form = 1; \
2209 cmp_status->method = COMPOSITION_RELATIVE; \
2210 cmp_status->state = COMPOSING_CHAR; \
2211 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2212 cmp_status->nchars = cmp_status->ncomps = 0; \
2213 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2214 } while (0)
2215
2216
2217 /* Start of Emacs 20 style format for rule-base composition. */
2218
2219 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2220 do { \
2221 cmp_status->old_form = 1; \
2222 cmp_status->method = COMPOSITION_WITH_RULE; \
2223 cmp_status->state = COMPOSING_CHAR; \
2224 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2225 cmp_status->nchars = cmp_status->ncomps = 0; \
2226 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2227 } while (0)
2228
2229
2230 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2231 do { \
2232 const unsigned char *current_src = src; \
2233 \
2234 ONE_MORE_BYTE (c); \
2235 if (c < 0) \
2236 goto invalid_code; \
2237 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2238 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2239 DECODE_EMACS_MULE_21_COMPOSITION (); \
2240 else if (c < 0xA0) \
2241 goto invalid_code; \
2242 else if (c < 0xC0) \
2243 { \
2244 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2245 /* Re-read C as a composition component. */ \
2246 src = current_src; \
2247 } \
2248 else if (c == 0xFF) \
2249 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2250 else \
2251 goto invalid_code; \
2252 } while (0)
2253
2254 #define EMACS_MULE_COMPOSITION_END() \
2255 do { \
2256 int idx = - cmp_status->length; \
2257 \
2258 if (cmp_status->old_form) \
2259 charbuf[idx + 2] = cmp_status->nchars; \
2260 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2261 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2262 cmp_status->state = COMPOSING_NO; \
2263 } while (0)
2264
2265
2266 static int
2267 emacs_mule_finish_composition (int *charbuf,
2268 struct composition_status *cmp_status)
2269 {
2270 int idx = - cmp_status->length;
2271 int new_chars;
2272
2273 if (cmp_status->old_form && cmp_status->nchars > 0)
2274 {
2275 charbuf[idx + 2] = cmp_status->nchars;
2276 new_chars = 0;
2277 if (cmp_status->method == COMPOSITION_WITH_RULE
2278 && cmp_status->state == COMPOSING_CHAR)
2279 {
2280 /* The last rule was invalid. */
2281 int rule = charbuf[-1] + 0xA0;
2282
2283 charbuf[-2] = BYTE8_TO_CHAR (rule);
2284 charbuf[-1] = -1;
2285 new_chars = 1;
2286 }
2287 }
2288 else
2289 {
2290 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2291
2292 if (cmp_status->method == COMPOSITION_WITH_RULE)
2293 {
2294 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2295 charbuf[idx++] = -3;
2296 charbuf[idx++] = 0;
2297 new_chars = 1;
2298 }
2299 else
2300 {
2301 int nchars = charbuf[idx + 1] + 0xA0;
2302 int nbytes = charbuf[idx + 2] + 0xA0;
2303
2304 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2305 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2306 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2307 charbuf[idx++] = -1;
2308 new_chars = 4;
2309 }
2310 }
2311 cmp_status->state = COMPOSING_NO;
2312 return new_chars;
2313 }
2314
2315 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2316 do { \
2317 if (cmp_status->state != COMPOSING_NO) \
2318 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2319 } while (0)
2320
2321
2322 static void
2323 decode_coding_emacs_mule (struct coding_system *coding)
2324 {
2325 const unsigned char *src = coding->source + coding->consumed;
2326 const unsigned char *src_end = coding->source + coding->src_bytes;
2327 const unsigned char *src_base;
2328 int *charbuf = coding->charbuf + coding->charbuf_used;
2329 /* We may produce two annotations (charset and composition) in one
2330 loop and one more charset annotation at the end. */
2331 int *charbuf_end
2332 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2333 /* We can produce up to 2 characters in a loop. */
2334 - 1;
2335 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2336 bool multibytep = coding->src_multibyte;
2337 ptrdiff_t char_offset = coding->produced_char;
2338 ptrdiff_t last_offset = char_offset;
2339 int last_id = charset_ascii;
2340 bool eol_dos
2341 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2342 int byte_after_cr = -1;
2343 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2344
2345 if (cmp_status->state != COMPOSING_NO)
2346 {
2347 int i;
2348
2349 if (charbuf_end - charbuf < cmp_status->length)
2350 emacs_abort ();
2351 for (i = 0; i < cmp_status->length; i++)
2352 *charbuf++ = cmp_status->carryover[i];
2353 coding->annotated = 1;
2354 }
2355
2356 while (1)
2357 {
2358 int c, id IF_LINT (= 0);
2359
2360 src_base = src;
2361 consumed_chars_base = consumed_chars;
2362
2363 if (charbuf >= charbuf_end)
2364 {
2365 if (byte_after_cr >= 0)
2366 src_base--;
2367 break;
2368 }
2369
2370 if (byte_after_cr >= 0)
2371 c = byte_after_cr, byte_after_cr = -1;
2372 else
2373 ONE_MORE_BYTE (c);
2374
2375 if (c < 0 || c == 0x80)
2376 {
2377 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2378 if (c < 0)
2379 {
2380 *charbuf++ = -c;
2381 char_offset++;
2382 }
2383 else
2384 DECODE_EMACS_MULE_COMPOSITION_START ();
2385 continue;
2386 }
2387
2388 if (c < 0x80)
2389 {
2390 if (eol_dos && c == '\r')
2391 ONE_MORE_BYTE (byte_after_cr);
2392 id = charset_ascii;
2393 if (cmp_status->state != COMPOSING_NO)
2394 {
2395 if (cmp_status->old_form)
2396 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2397 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2398 cmp_status->ncomps--;
2399 }
2400 }
2401 else
2402 {
2403 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2404 /* emacs_mule_char can load a charset map from a file, which
2405 allocates a large structure and might cause buffer text
2406 to be relocated as result. Thus, we need to remember the
2407 original pointer to buffer text, and fix up all related
2408 pointers after the call. */
2409 const unsigned char *orig = coding->source;
2410 ptrdiff_t offset;
2411
2412 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2413 cmp_status);
2414 offset = coding->source - orig;
2415 if (offset)
2416 {
2417 src += offset;
2418 src_base += offset;
2419 src_end += offset;
2420 }
2421 if (c < 0)
2422 {
2423 if (c == -1)
2424 goto invalid_code;
2425 if (c == -2)
2426 break;
2427 }
2428 src = src_base + nbytes;
2429 consumed_chars = consumed_chars_base + nchars;
2430 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2431 cmp_status->ncomps -= nchars;
2432 }
2433
2434 /* Now if C >= 0, we found a normally encoded character, if C <
2435 0, we found an old-style composition component character or
2436 rule. */
2437
2438 if (cmp_status->state == COMPOSING_NO)
2439 {
2440 if (last_id != id)
2441 {
2442 if (last_id != charset_ascii)
2443 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2444 last_id);
2445 last_id = id;
2446 last_offset = char_offset;
2447 }
2448 *charbuf++ = c;
2449 char_offset++;
2450 }
2451 else if (cmp_status->state == COMPOSING_CHAR)
2452 {
2453 if (cmp_status->old_form)
2454 {
2455 if (c >= 0)
2456 {
2457 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2458 *charbuf++ = c;
2459 char_offset++;
2460 }
2461 else
2462 {
2463 *charbuf++ = -c;
2464 cmp_status->nchars++;
2465 cmp_status->length++;
2466 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2467 EMACS_MULE_COMPOSITION_END ();
2468 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2469 cmp_status->state = COMPOSING_RULE;
2470 }
2471 }
2472 else
2473 {
2474 *charbuf++ = c;
2475 cmp_status->length++;
2476 cmp_status->nchars--;
2477 if (cmp_status->nchars == 0)
2478 EMACS_MULE_COMPOSITION_END ();
2479 }
2480 }
2481 else if (cmp_status->state == COMPOSING_RULE)
2482 {
2483 int rule;
2484
2485 if (c >= 0)
2486 {
2487 EMACS_MULE_COMPOSITION_END ();
2488 *charbuf++ = c;
2489 char_offset++;
2490 }
2491 else
2492 {
2493 c = -c;
2494 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2495 if (rule < 0)
2496 goto invalid_code;
2497 *charbuf++ = -2;
2498 *charbuf++ = rule;
2499 cmp_status->length += 2;
2500 cmp_status->state = COMPOSING_CHAR;
2501 }
2502 }
2503 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2504 {
2505 *charbuf++ = c;
2506 cmp_status->length++;
2507 if (cmp_status->ncomps == 0)
2508 cmp_status->state = COMPOSING_CHAR;
2509 else if (cmp_status->ncomps > 0)
2510 {
2511 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2512 cmp_status->state = COMPOSING_COMPONENT_RULE;
2513 }
2514 else
2515 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2516 }
2517 else /* COMPOSING_COMPONENT_RULE */
2518 {
2519 int rule;
2520
2521 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2522 if (rule < 0)
2523 goto invalid_code;
2524 *charbuf++ = -2;
2525 *charbuf++ = rule;
2526 cmp_status->length += 2;
2527 cmp_status->ncomps--;
2528 if (cmp_status->ncomps > 0)
2529 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2530 else
2531 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2532 }
2533 continue;
2534
2535 invalid_code:
2536 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2537 src = src_base;
2538 consumed_chars = consumed_chars_base;
2539 ONE_MORE_BYTE (c);
2540 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2541 char_offset++;
2542 coding->errors++;
2543 }
2544
2545 no_more_source:
2546 if (cmp_status->state != COMPOSING_NO)
2547 {
2548 if (coding->mode & CODING_MODE_LAST_BLOCK)
2549 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2550 else
2551 {
2552 int i;
2553
2554 charbuf -= cmp_status->length;
2555 for (i = 0; i < cmp_status->length; i++)
2556 cmp_status->carryover[i] = charbuf[i];
2557 }
2558 }
2559 if (last_id != charset_ascii)
2560 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2561 coding->consumed_char += consumed_chars_base;
2562 coding->consumed = src_base - coding->source;
2563 coding->charbuf_used = charbuf - coding->charbuf;
2564 }
2565
2566
2567 #define EMACS_MULE_LEADING_CODES(id, codes) \
2568 do { \
2569 if (id < 0xA0) \
2570 codes[0] = id, codes[1] = 0; \
2571 else if (id < 0xE0) \
2572 codes[0] = 0x9A, codes[1] = id; \
2573 else if (id < 0xF0) \
2574 codes[0] = 0x9B, codes[1] = id; \
2575 else if (id < 0xF5) \
2576 codes[0] = 0x9C, codes[1] = id; \
2577 else \
2578 codes[0] = 0x9D, codes[1] = id; \
2579 } while (0);
2580
2581
2582 static bool
2583 encode_coding_emacs_mule (struct coding_system *coding)
2584 {
2585 bool multibytep = coding->dst_multibyte;
2586 int *charbuf = coding->charbuf;
2587 int *charbuf_end = charbuf + coding->charbuf_used;
2588 unsigned char *dst = coding->destination + coding->produced;
2589 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2590 int safe_room = 8;
2591 ptrdiff_t produced_chars = 0;
2592 Lisp_Object attrs, charset_list;
2593 int c;
2594 int preferred_charset_id = -1;
2595
2596 CODING_GET_INFO (coding, attrs, charset_list);
2597 if (! EQ (charset_list, Vemacs_mule_charset_list))
2598 {
2599 charset_list = Vemacs_mule_charset_list;
2600 ASET (attrs, coding_attr_charset_list, charset_list);
2601 }
2602
2603 while (charbuf < charbuf_end)
2604 {
2605 ASSURE_DESTINATION (safe_room);
2606 c = *charbuf++;
2607
2608 if (c < 0)
2609 {
2610 /* Handle an annotation. */
2611 switch (*charbuf)
2612 {
2613 case CODING_ANNOTATE_COMPOSITION_MASK:
2614 /* Not yet implemented. */
2615 break;
2616 case CODING_ANNOTATE_CHARSET_MASK:
2617 preferred_charset_id = charbuf[3];
2618 if (preferred_charset_id >= 0
2619 && NILP (Fmemq (make_number (preferred_charset_id),
2620 charset_list)))
2621 preferred_charset_id = -1;
2622 break;
2623 default:
2624 emacs_abort ();
2625 }
2626 charbuf += -c - 1;
2627 continue;
2628 }
2629
2630 if (ASCII_CHAR_P (c))
2631 EMIT_ONE_ASCII_BYTE (c);
2632 else if (CHAR_BYTE8_P (c))
2633 {
2634 c = CHAR_TO_BYTE8 (c);
2635 EMIT_ONE_BYTE (c);
2636 }
2637 else
2638 {
2639 struct charset *charset;
2640 unsigned code;
2641 int dimension;
2642 int emacs_mule_id;
2643 unsigned char leading_codes[2];
2644
2645 if (preferred_charset_id >= 0)
2646 {
2647 bool result;
2648
2649 charset = CHARSET_FROM_ID (preferred_charset_id);
2650 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2651 if (result)
2652 code = ENCODE_CHAR (charset, c);
2653 else
2654 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2655 &code, charset);
2656 }
2657 else
2658 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2659 &code, charset);
2660 if (! charset)
2661 {
2662 c = coding->default_char;
2663 if (ASCII_CHAR_P (c))
2664 {
2665 EMIT_ONE_ASCII_BYTE (c);
2666 continue;
2667 }
2668 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2669 &code, charset);
2670 }
2671 dimension = CHARSET_DIMENSION (charset);
2672 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2673 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2674 EMIT_ONE_BYTE (leading_codes[0]);
2675 if (leading_codes[1])
2676 EMIT_ONE_BYTE (leading_codes[1]);
2677 if (dimension == 1)
2678 EMIT_ONE_BYTE (code | 0x80);
2679 else
2680 {
2681 code |= 0x8080;
2682 EMIT_ONE_BYTE (code >> 8);
2683 EMIT_ONE_BYTE (code & 0xFF);
2684 }
2685 }
2686 }
2687 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2688 coding->produced_char += produced_chars;
2689 coding->produced = dst - coding->destination;
2690 return 0;
2691 }
2692
2693 \f
2694 /*** 7. ISO2022 handlers ***/
2695
2696 /* The following note describes the coding system ISO2022 briefly.
2697 Since the intention of this note is to help understand the
2698 functions in this file, some parts are NOT ACCURATE or are OVERLY
2699 SIMPLIFIED. For thorough understanding, please refer to the
2700 original document of ISO2022. This is equivalent to the standard
2701 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2702
2703 ISO2022 provides many mechanisms to encode several character sets
2704 in 7-bit and 8-bit environments. For 7-bit environments, all text
2705 is encoded using bytes less than 128. This may make the encoded
2706 text a little bit longer, but the text passes more easily through
2707 several types of gateway, some of which strip off the MSB (Most
2708 Significant Bit).
2709
2710 There are two kinds of character sets: control character sets and
2711 graphic character sets. The former contain control characters such
2712 as `newline' and `escape' to provide control functions (control
2713 functions are also provided by escape sequences). The latter
2714 contain graphic characters such as 'A' and '-'. Emacs recognizes
2715 two control character sets and many graphic character sets.
2716
2717 Graphic character sets are classified into one of the following
2718 four classes, according to the number of bytes (DIMENSION) and
2719 number of characters in one dimension (CHARS) of the set:
2720 - DIMENSION1_CHARS94
2721 - DIMENSION1_CHARS96
2722 - DIMENSION2_CHARS94
2723 - DIMENSION2_CHARS96
2724
2725 In addition, each character set is assigned an identification tag,
2726 unique for each set, called the "final character" (denoted as <F>
2727 hereafter). The <F> of each character set is decided by ECMA(*)
2728 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2729 (0x30..0x3F are for private use only).
2730
2731 Note (*): ECMA = European Computer Manufacturers Association
2732
2733 Here are examples of graphic character sets [NAME(<F>)]:
2734 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2735 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2736 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2737 o DIMENSION2_CHARS96 -- none for the moment
2738
2739 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2740 C0 [0x00..0x1F] -- control character plane 0
2741 GL [0x20..0x7F] -- graphic character plane 0
2742 C1 [0x80..0x9F] -- control character plane 1
2743 GR [0xA0..0xFF] -- graphic character plane 1
2744
2745 A control character set is directly designated and invoked to C0 or
2746 C1 by an escape sequence. The most common case is that:
2747 - ISO646's control character set is designated/invoked to C0, and
2748 - ISO6429's control character set is designated/invoked to C1,
2749 and usually these designations/invocations are omitted in encoded
2750 text. In a 7-bit environment, only C0 can be used, and a control
2751 character for C1 is encoded by an appropriate escape sequence to
2752 fit into the environment. All control characters for C1 are
2753 defined to have corresponding escape sequences.
2754
2755 A graphic character set is at first designated to one of four
2756 graphic registers (G0 through G3), then these graphic registers are
2757 invoked to GL or GR. These designations and invocations can be
2758 done independently. The most common case is that G0 is invoked to
2759 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2760 these invocations and designations are omitted in encoded text.
2761 In a 7-bit environment, only GL can be used.
2762
2763 When a graphic character set of CHARS94 is invoked to GL, codes
2764 0x20 and 0x7F of the GL area work as control characters SPACE and
2765 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2766 be used.
2767
2768 There are two ways of invocation: locking-shift and single-shift.
2769 With locking-shift, the invocation lasts until the next different
2770 invocation, whereas with single-shift, the invocation affects the
2771 following character only and doesn't affect the locking-shift
2772 state. Invocations are done by the following control characters or
2773 escape sequences:
2774
2775 ----------------------------------------------------------------------
2776 abbrev function cntrl escape seq description
2777 ----------------------------------------------------------------------
2778 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2779 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2780 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2781 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2782 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2783 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2784 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2785 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2786 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2787 ----------------------------------------------------------------------
2788 (*) These are not used by any known coding system.
2789
2790 Control characters for these functions are defined by macros
2791 ISO_CODE_XXX in `coding.h'.
2792
2793 Designations are done by the following escape sequences:
2794 ----------------------------------------------------------------------
2795 escape sequence description
2796 ----------------------------------------------------------------------
2797 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2798 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2799 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2800 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2801 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2802 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2803 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2804 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2805 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2806 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2807 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2808 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2809 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2810 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2811 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2812 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2813 ----------------------------------------------------------------------
2814
2815 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2816 of dimension 1, chars 94, and final character <F>, etc...
2817
2818 Note (*): Although these designations are not allowed in ISO2022,
2819 Emacs accepts them on decoding, and produces them on encoding
2820 CHARS96 character sets in a coding system which is characterized as
2821 7-bit environment, non-locking-shift, and non-single-shift.
2822
2823 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2824 '(' must be omitted. We refer to this as "short-form" hereafter.
2825
2826 Now you may notice that there are a lot of ways of encoding the
2827 same multilingual text in ISO2022. Actually, there exist many
2828 coding systems such as Compound Text (used in X11's inter client
2829 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2830 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2831 localized platforms), and all of these are variants of ISO2022.
2832
2833 In addition to the above, Emacs handles two more kinds of escape
2834 sequences: ISO6429's direction specification and Emacs' private
2835 sequence for specifying character composition.
2836
2837 ISO6429's direction specification takes the following form:
2838 o CSI ']' -- end of the current direction
2839 o CSI '0' ']' -- end of the current direction
2840 o CSI '1' ']' -- start of left-to-right text
2841 o CSI '2' ']' -- start of right-to-left text
2842 The control character CSI (0x9B: control sequence introducer) is
2843 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2844
2845 Character composition specification takes the following form:
2846 o ESC '0' -- start relative composition
2847 o ESC '1' -- end composition
2848 o ESC '2' -- start rule-base composition (*)
2849 o ESC '3' -- start relative composition with alternate chars (**)
2850 o ESC '4' -- start rule-base composition with alternate chars (**)
2851 Since these are not standard escape sequences of any ISO standard,
2852 the use of them with these meanings is restricted to Emacs only.
2853
2854 (*) This form is used only in Emacs 20.7 and older versions,
2855 but newer versions can safely decode it.
2856 (**) This form is used only in Emacs 21.1 and newer versions,
2857 and older versions can't decode it.
2858
2859 Here's a list of example usages of these composition escape
2860 sequences (categorized by `enum composition_method').
2861
2862 COMPOSITION_RELATIVE:
2863 ESC 0 CHAR [ CHAR ] ESC 1
2864 COMPOSITION_WITH_RULE:
2865 ESC 2 CHAR [ RULE CHAR ] ESC 1
2866 COMPOSITION_WITH_ALTCHARS:
2867 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2868 COMPOSITION_WITH_RULE_ALTCHARS:
2869 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2870
2871 static enum iso_code_class_type iso_code_class[256];
2872
2873 #define SAFE_CHARSET_P(coding, id) \
2874 ((id) <= (coding)->max_charset_id \
2875 && (coding)->safe_charsets[id] != 255)
2876
2877 static void
2878 setup_iso_safe_charsets (Lisp_Object attrs)
2879 {
2880 Lisp_Object charset_list, safe_charsets;
2881 Lisp_Object request;
2882 Lisp_Object reg_usage;
2883 Lisp_Object tail;
2884 EMACS_INT reg94, reg96;
2885 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2886 int max_charset_id;
2887
2888 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2889 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2890 && ! EQ (charset_list, Viso_2022_charset_list))
2891 {
2892 charset_list = Viso_2022_charset_list;
2893 ASET (attrs, coding_attr_charset_list, charset_list);
2894 ASET (attrs, coding_attr_safe_charsets, Qnil);
2895 }
2896
2897 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2898 return;
2899
2900 max_charset_id = 0;
2901 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2902 {
2903 int id = XINT (XCAR (tail));
2904 if (max_charset_id < id)
2905 max_charset_id = id;
2906 }
2907
2908 safe_charsets = make_uninit_string (max_charset_id + 1);
2909 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2910 request = AREF (attrs, coding_attr_iso_request);
2911 reg_usage = AREF (attrs, coding_attr_iso_usage);
2912 reg94 = XINT (XCAR (reg_usage));
2913 reg96 = XINT (XCDR (reg_usage));
2914
2915 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2916 {
2917 Lisp_Object id;
2918 Lisp_Object reg;
2919 struct charset *charset;
2920
2921 id = XCAR (tail);
2922 charset = CHARSET_FROM_ID (XINT (id));
2923 reg = Fcdr (Fassq (id, request));
2924 if (! NILP (reg))
2925 SSET (safe_charsets, XINT (id), XINT (reg));
2926 else if (charset->iso_chars_96)
2927 {
2928 if (reg96 < 4)
2929 SSET (safe_charsets, XINT (id), reg96);
2930 }
2931 else
2932 {
2933 if (reg94 < 4)
2934 SSET (safe_charsets, XINT (id), reg94);
2935 }
2936 }
2937 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2938 }
2939
2940
2941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2942 Return true if a text is encoded in one of ISO-2022 based coding
2943 systems. */
2944
2945 static bool
2946 detect_coding_iso_2022 (struct coding_system *coding,
2947 struct coding_detection_info *detect_info)
2948 {
2949 const unsigned char *src = coding->source, *src_base = src;
2950 const unsigned char *src_end = coding->source + coding->src_bytes;
2951 bool multibytep = coding->src_multibyte;
2952 bool single_shifting = 0;
2953 int id;
2954 int c, c1;
2955 ptrdiff_t consumed_chars = 0;
2956 int i;
2957 int rejected = 0;
2958 int found = 0;
2959 int composition_count = -1;
2960
2961 detect_info->checked |= CATEGORY_MASK_ISO;
2962
2963 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2964 {
2965 struct coding_system *this = &(coding_categories[i]);
2966 Lisp_Object attrs, val;
2967
2968 if (this->id < 0)
2969 continue;
2970 attrs = CODING_ID_ATTRS (this->id);
2971 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2972 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2973 setup_iso_safe_charsets (attrs);
2974 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2975 this->max_charset_id = SCHARS (val) - 1;
2976 this->safe_charsets = SDATA (val);
2977 }
2978
2979 /* A coding system of this category is always ASCII compatible. */
2980 src += coding->head_ascii;
2981
2982 while (rejected != CATEGORY_MASK_ISO)
2983 {
2984 src_base = src;
2985 ONE_MORE_BYTE (c);
2986 switch (c)
2987 {
2988 case ISO_CODE_ESC:
2989 if (inhibit_iso_escape_detection)
2990 break;
2991 single_shifting = 0;
2992 ONE_MORE_BYTE (c);
2993 if (c == 'N' || c == 'O')
2994 {
2995 /* ESC <Fe> for SS2 or SS3. */
2996 single_shifting = 1;
2997 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2998 }
2999 else if (c == '1')
3000 {
3001 /* End of composition. */
3002 if (composition_count < 0
3003 || composition_count > MAX_COMPOSITION_COMPONENTS)
3004 /* Invalid */
3005 break;
3006 composition_count = -1;
3007 found |= CATEGORY_MASK_ISO;
3008 }
3009 else if (c >= '0' && c <= '4')
3010 {
3011 /* ESC <Fp> for start/end composition. */
3012 composition_count = 0;
3013 }
3014 else
3015 {
3016 if (c >= '(' && c <= '/')
3017 {
3018 /* Designation sequence for a charset of dimension 1. */
3019 ONE_MORE_BYTE (c1);
3020 if (c1 < ' ' || c1 >= 0x80
3021 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3022 /* Invalid designation sequence. Just ignore. */
3023 break;
3024 }
3025 else if (c == '$')
3026 {
3027 /* Designation sequence for a charset of dimension 2. */
3028 ONE_MORE_BYTE (c);
3029 if (c >= '@' && c <= 'B')
3030 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3031 id = iso_charset_table[1][0][c];
3032 else if (c >= '(' && c <= '/')
3033 {
3034 ONE_MORE_BYTE (c1);
3035 if (c1 < ' ' || c1 >= 0x80
3036 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3037 /* Invalid designation sequence. Just ignore. */
3038 break;
3039 }
3040 else
3041 /* Invalid designation sequence. Just ignore it. */
3042 break;
3043 }
3044 else
3045 {
3046 /* Invalid escape sequence. Just ignore it. */
3047 break;
3048 }
3049
3050 /* We found a valid designation sequence for CHARSET. */
3051 rejected |= CATEGORY_MASK_ISO_8BIT;
3052 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3053 id))
3054 found |= CATEGORY_MASK_ISO_7;
3055 else
3056 rejected |= CATEGORY_MASK_ISO_7;
3057 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3058 id))
3059 found |= CATEGORY_MASK_ISO_7_TIGHT;
3060 else
3061 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3062 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3063 id))
3064 found |= CATEGORY_MASK_ISO_7_ELSE;
3065 else
3066 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3067 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3068 id))
3069 found |= CATEGORY_MASK_ISO_8_ELSE;
3070 else
3071 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3072 }
3073 break;
3074
3075 case ISO_CODE_SO:
3076 case ISO_CODE_SI:
3077 /* Locking shift out/in. */
3078 if (inhibit_iso_escape_detection)
3079 break;
3080 single_shifting = 0;
3081 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3082 break;
3083
3084 case ISO_CODE_CSI:
3085 /* Control sequence introducer. */
3086 single_shifting = 0;
3087 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088 found |= CATEGORY_MASK_ISO_8_ELSE;
3089 goto check_extra_latin;
3090
3091 case ISO_CODE_SS2:
3092 case ISO_CODE_SS3:
3093 /* Single shift. */
3094 if (inhibit_iso_escape_detection)
3095 break;
3096 single_shifting = 0;
3097 rejected |= CATEGORY_MASK_ISO_7BIT;
3098 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3099 & CODING_ISO_FLAG_SINGLE_SHIFT)
3100 {
3101 found |= CATEGORY_MASK_ISO_8_1;
3102 single_shifting = 1;
3103 }
3104 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3105 & CODING_ISO_FLAG_SINGLE_SHIFT)
3106 {
3107 found |= CATEGORY_MASK_ISO_8_2;
3108 single_shifting = 1;
3109 }
3110 if (single_shifting)
3111 break;
3112 goto check_extra_latin;
3113
3114 default:
3115 if (c < 0)
3116 continue;
3117 if (c < 0x80)
3118 {
3119 if (composition_count >= 0)
3120 composition_count++;
3121 single_shifting = 0;
3122 break;
3123 }
3124 if (c >= 0xA0)
3125 {
3126 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3127 found |= CATEGORY_MASK_ISO_8_1;
3128 /* Check the length of succeeding codes of the range
3129 0xA0..0FF. If the byte length is even, we include
3130 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3131 only when we are not single shifting. */
3132 if (! single_shifting
3133 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3134 {
3135 int len = 1;
3136 while (src < src_end)
3137 {
3138 src_base = src;
3139 ONE_MORE_BYTE (c);
3140 if (c < 0xA0)
3141 {
3142 src = src_base;
3143 break;
3144 }
3145 len++;
3146 }
3147
3148 if (len & 1 && src < src_end)
3149 {
3150 rejected |= CATEGORY_MASK_ISO_8_2;
3151 if (composition_count >= 0)
3152 composition_count += len;
3153 }
3154 else
3155 {
3156 found |= CATEGORY_MASK_ISO_8_2;
3157 if (composition_count >= 0)
3158 composition_count += len / 2;
3159 }
3160 }
3161 break;
3162 }
3163 check_extra_latin:
3164 if (! VECTORP (Vlatin_extra_code_table)
3165 || NILP (AREF (Vlatin_extra_code_table, c)))
3166 {
3167 rejected = CATEGORY_MASK_ISO;
3168 break;
3169 }
3170 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3171 & CODING_ISO_FLAG_LATIN_EXTRA)
3172 found |= CATEGORY_MASK_ISO_8_1;
3173 else
3174 rejected |= CATEGORY_MASK_ISO_8_1;
3175 rejected |= CATEGORY_MASK_ISO_8_2;
3176 break;
3177 }
3178 }
3179 detect_info->rejected |= CATEGORY_MASK_ISO;
3180 return 0;
3181
3182 no_more_source:
3183 detect_info->rejected |= rejected;
3184 detect_info->found |= (found & ~rejected);
3185 return 1;
3186 }
3187
3188
3189 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3190 escape sequence should be kept. */
3191 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3192 do { \
3193 int id, prev; \
3194 \
3195 if (final < '0' || final >= 128 \
3196 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3197 || !SAFE_CHARSET_P (coding, id)) \
3198 { \
3199 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3200 chars_96 = -1; \
3201 break; \
3202 } \
3203 prev = CODING_ISO_DESIGNATION (coding, reg); \
3204 if (id == charset_jisx0201_roman) \
3205 { \
3206 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3207 id = charset_ascii; \
3208 } \
3209 else if (id == charset_jisx0208_1978) \
3210 { \
3211 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3212 id = charset_jisx0208; \
3213 } \
3214 CODING_ISO_DESIGNATION (coding, reg) = id; \
3215 /* If there was an invalid designation to REG previously, and this \
3216 designation is ASCII to REG, we should keep this designation \
3217 sequence. */ \
3218 if (prev == -2 && id == charset_ascii) \
3219 chars_96 = -1; \
3220 } while (0)
3221
3222
3223 /* Handle these composition sequence (ALT: alternate char):
3224
3225 (1) relative composition: ESC 0 CHAR ... ESC 1
3226 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3227 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3228 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3229
3230 When the start sequence (ESC 0/2/3/4) is found, this annotation
3231 header is produced.
3232
3233 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3234
3235 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3236 produced until the end sequence (ESC 1) is found:
3237
3238 (1) CHAR ... CHAR
3239 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3240 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3241 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3242
3243 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3244 annotation header is updated as below:
3245
3246 (1) LENGTH: unchanged, NCHARS: number of CHARs
3247 (2) LENGTH: unchanged, NCHARS: number of CHARs
3248 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3249 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3250
3251 If an error is found while composing, the annotation header is
3252 changed to:
3253
3254 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3255
3256 and the sequence [ -2 DECODED-RULE ] is changed to the original
3257 byte sequence as below:
3258 o the original byte sequence is B: [ B -1 ]
3259 o the original byte sequence is B1 B2: [ B1 B2 ]
3260 and the sequence [ -1 -1 ] is changed to the original byte
3261 sequence:
3262 [ ESC '0' ]
3263 */
3264
3265 /* Decode a composition rule C1 and maybe one more byte from the
3266 source, and set RULE to the encoded composition rule. If the rule
3267 is invalid, goto invalid_code. */
3268
3269 #define DECODE_COMPOSITION_RULE(rule) \
3270 do { \
3271 rule = c1 - 32; \
3272 if (rule < 0) \
3273 goto invalid_code; \
3274 if (rule < 81) /* old format (before ver.21) */ \
3275 { \
3276 int gref = (rule) / 9; \
3277 int nref = (rule) % 9; \
3278 if (gref == 4) gref = 10; \
3279 if (nref == 4) nref = 10; \
3280 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3281 } \
3282 else /* new format (after ver.21) */ \
3283 { \
3284 int b; \
3285 \
3286 ONE_MORE_BYTE (b); \
3287 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3288 goto invalid_code; \
3289 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3290 rule += 0x100; /* Distinguish it from the old format. */ \
3291 } \
3292 } while (0)
3293
3294 #define ENCODE_COMPOSITION_RULE(rule) \
3295 do { \
3296 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3297 \
3298 if (rule < 0x100) /* old format */ \
3299 { \
3300 if (gref == 10) gref = 4; \
3301 if (nref == 10) nref = 4; \
3302 charbuf[idx] = 32 + gref * 9 + nref; \
3303 charbuf[idx + 1] = -1; \
3304 new_chars++; \
3305 } \
3306 else /* new format */ \
3307 { \
3308 charbuf[idx] = 32 + 81 + gref; \
3309 charbuf[idx + 1] = 32 + nref; \
3310 new_chars += 2; \
3311 } \
3312 } while (0)
3313
3314 /* Finish the current composition as invalid. */
3315
3316 static int
3317 finish_composition (int *charbuf, struct composition_status *cmp_status)
3318 {
3319 int idx = - cmp_status->length;
3320 int new_chars;
3321
3322 /* Recover the original ESC sequence */
3323 charbuf[idx++] = ISO_CODE_ESC;
3324 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3325 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3326 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3327 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3328 : '4');
3329 charbuf[idx++] = -2;
3330 charbuf[idx++] = 0;
3331 charbuf[idx++] = -1;
3332 new_chars = cmp_status->nchars;
3333 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3334 for (; idx < 0; idx++)
3335 {
3336 int elt = charbuf[idx];
3337
3338 if (elt == -2)
3339 {
3340 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3341 idx++;
3342 }
3343 else if (elt == -1)
3344 {
3345 charbuf[idx++] = ISO_CODE_ESC;
3346 charbuf[idx] = '0';
3347 new_chars += 2;
3348 }
3349 }
3350 cmp_status->state = COMPOSING_NO;
3351 return new_chars;
3352 }
3353
3354 /* If characters are under composition, finish the composition. */
3355 #define MAYBE_FINISH_COMPOSITION() \
3356 do { \
3357 if (cmp_status->state != COMPOSING_NO) \
3358 char_offset += finish_composition (charbuf, cmp_status); \
3359 } while (0)
3360
3361 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3362
3363 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3364 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3365 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3366 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3367
3368 Produce this annotation sequence now:
3369
3370 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3371 */
3372
3373 #define DECODE_COMPOSITION_START(c1) \
3374 do { \
3375 if (c1 == '0' \
3376 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3377 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3378 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3379 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3380 { \
3381 *charbuf++ = -1; \
3382 *charbuf++= -1; \
3383 cmp_status->state = COMPOSING_CHAR; \
3384 cmp_status->length += 2; \
3385 } \
3386 else \
3387 { \
3388 MAYBE_FINISH_COMPOSITION (); \
3389 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3390 : c1 == '2' ? COMPOSITION_WITH_RULE \
3391 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3392 : COMPOSITION_WITH_RULE_ALTCHARS); \
3393 cmp_status->state \
3394 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3395 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3396 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3397 cmp_status->nchars = cmp_status->ncomps = 0; \
3398 coding->annotated = 1; \
3399 } \
3400 } while (0)
3401
3402
3403 /* Handle composition end sequence ESC 1. */
3404
3405 #define DECODE_COMPOSITION_END() \
3406 do { \
3407 if (cmp_status->nchars == 0 \
3408 || ((cmp_status->state == COMPOSING_CHAR) \
3409 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3410 { \
3411 MAYBE_FINISH_COMPOSITION (); \
3412 goto invalid_code; \
3413 } \
3414 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3415 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3416 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3417 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3418 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3419 char_offset += cmp_status->nchars; \
3420 cmp_status->state = COMPOSING_NO; \
3421 } while (0)
3422
3423 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3424
3425 #define STORE_COMPOSITION_RULE(rule) \
3426 do { \
3427 *charbuf++ = -2; \
3428 *charbuf++ = rule; \
3429 cmp_status->length += 2; \
3430 cmp_status->state--; \
3431 } while (0)
3432
3433 /* Store a composed char or a component char C in charbuf, and update
3434 cmp_status. */
3435
3436 #define STORE_COMPOSITION_CHAR(c) \
3437 do { \
3438 *charbuf++ = (c); \
3439 cmp_status->length++; \
3440 if (cmp_status->state == COMPOSING_CHAR) \
3441 cmp_status->nchars++; \
3442 else \
3443 cmp_status->ncomps++; \
3444 if (cmp_status->method == COMPOSITION_WITH_RULE \
3445 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3446 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3447 cmp_status->state++; \
3448 } while (0)
3449
3450
3451 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3452
3453 static void
3454 decode_coding_iso_2022 (struct coding_system *coding)
3455 {
3456 const unsigned char *src = coding->source + coding->consumed;
3457 const unsigned char *src_end = coding->source + coding->src_bytes;
3458 const unsigned char *src_base;
3459 int *charbuf = coding->charbuf + coding->charbuf_used;
3460 /* We may produce two annotations (charset and composition) in one
3461 loop and one more charset annotation at the end. */
3462 int *charbuf_end
3463 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3464 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3465 bool multibytep = coding->src_multibyte;
3466 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3467 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3468 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3469 int charset_id_2, charset_id_3;
3470 struct charset *charset;
3471 int c;
3472 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3473 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3474 ptrdiff_t char_offset = coding->produced_char;
3475 ptrdiff_t last_offset = char_offset;
3476 int last_id = charset_ascii;
3477 bool eol_dos
3478 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3479 int byte_after_cr = -1;
3480 int i;
3481
3482 setup_iso_safe_charsets (attrs);
3483 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3484
3485 if (cmp_status->state != COMPOSING_NO)
3486 {
3487 if (charbuf_end - charbuf < cmp_status->length)
3488 emacs_abort ();
3489 for (i = 0; i < cmp_status->length; i++)
3490 *charbuf++ = cmp_status->carryover[i];
3491 coding->annotated = 1;
3492 }
3493
3494 while (1)
3495 {
3496 int c1, c2, c3;
3497
3498 src_base = src;
3499 consumed_chars_base = consumed_chars;
3500
3501 if (charbuf >= charbuf_end)
3502 {
3503 if (byte_after_cr >= 0)
3504 src_base--;
3505 break;
3506 }
3507
3508 if (byte_after_cr >= 0)
3509 c1 = byte_after_cr, byte_after_cr = -1;
3510 else
3511 ONE_MORE_BYTE (c1);
3512 if (c1 < 0)
3513 goto invalid_code;
3514
3515 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3516 {
3517 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3518 char_offset++;
3519 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3520 continue;
3521 }
3522
3523 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3524 {
3525 if (c1 == ISO_CODE_ESC)
3526 {
3527 if (src + 1 >= src_end)
3528 goto no_more_source;
3529 *charbuf++ = ISO_CODE_ESC;
3530 char_offset++;
3531 if (src[0] == '%' && src[1] == '@')
3532 {
3533 src += 2;
3534 consumed_chars += 2;
3535 char_offset += 2;
3536 /* We are sure charbuf can contain two more chars. */
3537 *charbuf++ = '%';
3538 *charbuf++ = '@';
3539 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3540 }
3541 }
3542 else
3543 {
3544 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3545 char_offset++;
3546 }
3547 continue;
3548 }
3549
3550 if ((cmp_status->state == COMPOSING_RULE
3551 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3552 && c1 != ISO_CODE_ESC)
3553 {
3554 int rule;
3555
3556 DECODE_COMPOSITION_RULE (rule);
3557 STORE_COMPOSITION_RULE (rule);
3558 continue;
3559 }
3560
3561 /* We produce at most one character. */
3562 switch (iso_code_class [c1])
3563 {
3564 case ISO_0x20_or_0x7F:
3565 if (charset_id_0 < 0
3566 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3567 /* This is SPACE or DEL. */
3568 charset = CHARSET_FROM_ID (charset_ascii);
3569 else
3570 charset = CHARSET_FROM_ID (charset_id_0);
3571 break;
3572
3573 case ISO_graphic_plane_0:
3574 if (charset_id_0 < 0)
3575 charset = CHARSET_FROM_ID (charset_ascii);
3576 else
3577 charset = CHARSET_FROM_ID (charset_id_0);
3578 break;
3579
3580 case ISO_0xA0_or_0xFF:
3581 if (charset_id_1 < 0
3582 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3583 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3584 goto invalid_code;
3585 /* This is a graphic character, we fall down ... */
3586
3587 case ISO_graphic_plane_1:
3588 if (charset_id_1 < 0)
3589 goto invalid_code;
3590 charset = CHARSET_FROM_ID (charset_id_1);
3591 break;
3592
3593 case ISO_control_0:
3594 if (eol_dos && c1 == '\r')
3595 ONE_MORE_BYTE (byte_after_cr);
3596 MAYBE_FINISH_COMPOSITION ();
3597 charset = CHARSET_FROM_ID (charset_ascii);
3598 break;
3599
3600 case ISO_control_1:
3601 goto invalid_code;
3602
3603 case ISO_shift_out:
3604 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3605 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3606 goto invalid_code;
3607 CODING_ISO_INVOCATION (coding, 0) = 1;
3608 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3609 continue;
3610
3611 case ISO_shift_in:
3612 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3613 goto invalid_code;
3614 CODING_ISO_INVOCATION (coding, 0) = 0;
3615 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3616 continue;
3617
3618 case ISO_single_shift_2_7:
3619 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3620 goto invalid_code;
3621 case ISO_single_shift_2:
3622 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3623 goto invalid_code;
3624 /* SS2 is handled as an escape sequence of ESC 'N' */
3625 c1 = 'N';
3626 goto label_escape_sequence;
3627
3628 case ISO_single_shift_3:
3629 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3630 goto invalid_code;
3631 /* SS2 is handled as an escape sequence of ESC 'O' */
3632 c1 = 'O';
3633 goto label_escape_sequence;
3634
3635 case ISO_control_sequence_introducer:
3636 /* CSI is handled as an escape sequence of ESC '[' ... */
3637 c1 = '[';
3638 goto label_escape_sequence;
3639
3640 case ISO_escape:
3641 ONE_MORE_BYTE (c1);
3642 label_escape_sequence:
3643 /* Escape sequences handled here are invocation,
3644 designation, direction specification, and character
3645 composition specification. */
3646 switch (c1)
3647 {
3648 case '&': /* revision of following character set */
3649 ONE_MORE_BYTE (c1);
3650 if (!(c1 >= '@' && c1 <= '~'))
3651 goto invalid_code;
3652 ONE_MORE_BYTE (c1);
3653 if (c1 != ISO_CODE_ESC)
3654 goto invalid_code;
3655 ONE_MORE_BYTE (c1);
3656 goto label_escape_sequence;
3657
3658 case '$': /* designation of 2-byte character set */
3659 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3660 goto invalid_code;
3661 {
3662 int reg, chars96;
3663
3664 ONE_MORE_BYTE (c1);
3665 if (c1 >= '@' && c1 <= 'B')
3666 { /* designation of JISX0208.1978, GB2312.1980,
3667 or JISX0208.1980 */
3668 reg = 0, chars96 = 0;
3669 }
3670 else if (c1 >= 0x28 && c1 <= 0x2B)
3671 { /* designation of DIMENSION2_CHARS94 character set */
3672 reg = c1 - 0x28, chars96 = 0;
3673 ONE_MORE_BYTE (c1);
3674 }
3675 else if (c1 >= 0x2C && c1 <= 0x2F)
3676 { /* designation of DIMENSION2_CHARS96 character set */
3677 reg = c1 - 0x2C, chars96 = 1;
3678 ONE_MORE_BYTE (c1);
3679 }
3680 else
3681 goto invalid_code;
3682 DECODE_DESIGNATION (reg, 2, chars96, c1);
3683 /* We must update these variables now. */
3684 if (reg == 0)
3685 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3686 else if (reg == 1)
3687 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3688 if (chars96 < 0)
3689 goto invalid_code;
3690 }
3691 continue;
3692
3693 case 'n': /* invocation of locking-shift-2 */
3694 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3695 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3696 goto invalid_code;
3697 CODING_ISO_INVOCATION (coding, 0) = 2;
3698 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3699 continue;
3700
3701 case 'o': /* invocation of locking-shift-3 */
3702 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3703 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3704 goto invalid_code;
3705 CODING_ISO_INVOCATION (coding, 0) = 3;
3706 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3707 continue;
3708
3709 case 'N': /* invocation of single-shift-2 */
3710 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3711 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3712 goto invalid_code;
3713 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3714 if (charset_id_2 < 0)
3715 charset = CHARSET_FROM_ID (charset_ascii);
3716 else
3717 charset = CHARSET_FROM_ID (charset_id_2);
3718 ONE_MORE_BYTE (c1);
3719 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3720 goto invalid_code;
3721 break;
3722
3723 case 'O': /* invocation of single-shift-3 */
3724 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3725 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3726 goto invalid_code;
3727 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3728 if (charset_id_3 < 0)
3729 charset = CHARSET_FROM_ID (charset_ascii);
3730 else
3731 charset = CHARSET_FROM_ID (charset_id_3);
3732 ONE_MORE_BYTE (c1);
3733 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3734 goto invalid_code;
3735 break;
3736
3737 case '0': case '2': case '3': case '4': /* start composition */
3738 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3739 goto invalid_code;
3740 if (last_id != charset_ascii)
3741 {
3742 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3743 last_id = charset_ascii;
3744 last_offset = char_offset;
3745 }
3746 DECODE_COMPOSITION_START (c1);
3747 continue;
3748
3749 case '1': /* end composition */
3750 if (cmp_status->state == COMPOSING_NO)
3751 goto invalid_code;
3752 DECODE_COMPOSITION_END ();
3753 continue;
3754
3755 case '[': /* specification of direction */
3756 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3757 goto invalid_code;
3758 /* For the moment, nested direction is not supported.
3759 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3760 left-to-right, and nonzero means right-to-left. */
3761 ONE_MORE_BYTE (c1);
3762 switch (c1)
3763 {
3764 case ']': /* end of the current direction */
3765 coding->mode &= ~CODING_MODE_DIRECTION;
3766
3767 case '0': /* end of the current direction */
3768 case '1': /* start of left-to-right direction */
3769 ONE_MORE_BYTE (c1);
3770 if (c1 == ']')
3771 coding->mode &= ~CODING_MODE_DIRECTION;
3772 else
3773 goto invalid_code;
3774 break;
3775
3776 case '2': /* start of right-to-left direction */
3777 ONE_MORE_BYTE (c1);
3778 if (c1 == ']')
3779 coding->mode |= CODING_MODE_DIRECTION;
3780 else
3781 goto invalid_code;
3782 break;
3783
3784 default:
3785 goto invalid_code;
3786 }
3787 continue;
3788
3789 case '%':
3790 ONE_MORE_BYTE (c1);
3791 if (c1 == '/')
3792 {
3793 /* CTEXT extended segment:
3794 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3795 We keep these bytes as is for the moment.
3796 They may be decoded by post-read-conversion. */
3797 int dim, M, L;
3798 int size;
3799
3800 ONE_MORE_BYTE (dim);
3801 if (dim < '0' || dim > '4')
3802 goto invalid_code;
3803 ONE_MORE_BYTE (M);
3804 if (M < 128)
3805 goto invalid_code;
3806 ONE_MORE_BYTE (L);
3807 if (L < 128)
3808 goto invalid_code;
3809 size = ((M - 128) * 128) + (L - 128);
3810 if (charbuf + 6 > charbuf_end)
3811 goto break_loop;
3812 *charbuf++ = ISO_CODE_ESC;
3813 *charbuf++ = '%';
3814 *charbuf++ = '/';
3815 *charbuf++ = dim;
3816 *charbuf++ = BYTE8_TO_CHAR (M);
3817 *charbuf++ = BYTE8_TO_CHAR (L);
3818 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3819 }
3820 else if (c1 == 'G')
3821 {
3822 /* XFree86 extension for embedding UTF-8 in CTEXT:
3823 ESC % G --UTF-8-BYTES-- ESC % @
3824 We keep these bytes as is for the moment.
3825 They may be decoded by post-read-conversion. */
3826 if (charbuf + 3 > charbuf_end)
3827 goto break_loop;
3828 *charbuf++ = ISO_CODE_ESC;
3829 *charbuf++ = '%';
3830 *charbuf++ = 'G';
3831 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3832 }
3833 else
3834 goto invalid_code;
3835 continue;
3836 break;
3837
3838 default:
3839 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3840 goto invalid_code;
3841 {
3842 int reg, chars96;
3843
3844 if (c1 >= 0x28 && c1 <= 0x2B)
3845 { /* designation of DIMENSION1_CHARS94 character set */
3846 reg = c1 - 0x28, chars96 = 0;
3847 ONE_MORE_BYTE (c1);
3848 }
3849 else if (c1 >= 0x2C && c1 <= 0x2F)
3850 { /* designation of DIMENSION1_CHARS96 character set */
3851 reg = c1 - 0x2C, chars96 = 1;
3852 ONE_MORE_BYTE (c1);
3853 }
3854 else
3855 goto invalid_code;
3856 DECODE_DESIGNATION (reg, 1, chars96, c1);
3857 /* We must update these variables now. */
3858 if (reg == 0)
3859 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3860 else if (reg == 1)
3861 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3862 if (chars96 < 0)
3863 goto invalid_code;
3864 }
3865 continue;
3866 }
3867 break;
3868
3869 default:
3870 emacs_abort ();
3871 }
3872
3873 if (cmp_status->state == COMPOSING_NO
3874 && charset->id != charset_ascii
3875 && last_id != charset->id)
3876 {
3877 if (last_id != charset_ascii)
3878 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3879 last_id = charset->id;
3880 last_offset = char_offset;
3881 }
3882
3883 /* Now we know CHARSET and 1st position code C1 of a character.
3884 Produce a decoded character while getting 2nd and 3rd
3885 position codes C2, C3 if necessary. */
3886 if (CHARSET_DIMENSION (charset) > 1)
3887 {
3888 ONE_MORE_BYTE (c2);
3889 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3890 || ((c1 & 0x80) != (c2 & 0x80)))
3891 /* C2 is not in a valid range. */
3892 goto invalid_code;
3893 if (CHARSET_DIMENSION (charset) == 2)
3894 c1 = (c1 << 8) | c2;
3895 else
3896 {
3897 ONE_MORE_BYTE (c3);
3898 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3899 || ((c1 & 0x80) != (c3 & 0x80)))
3900 /* C3 is not in a valid range. */
3901 goto invalid_code;
3902 c1 = (c1 << 16) | (c2 << 8) | c2;
3903 }
3904 }
3905 c1 &= 0x7F7F7F;
3906 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3907 if (c < 0)
3908 {
3909 MAYBE_FINISH_COMPOSITION ();
3910 for (; src_base < src; src_base++, char_offset++)
3911 {
3912 if (ASCII_BYTE_P (*src_base))
3913 *charbuf++ = *src_base;
3914 else
3915 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3916 }
3917 }
3918 else if (cmp_status->state == COMPOSING_NO)
3919 {
3920 *charbuf++ = c;
3921 char_offset++;
3922 }
3923 else if ((cmp_status->state == COMPOSING_CHAR
3924 ? cmp_status->nchars
3925 : cmp_status->ncomps)
3926 >= MAX_COMPOSITION_COMPONENTS)
3927 {
3928 /* Too long composition. */
3929 MAYBE_FINISH_COMPOSITION ();
3930 *charbuf++ = c;
3931 char_offset++;
3932 }
3933 else
3934 STORE_COMPOSITION_CHAR (c);
3935 continue;
3936
3937 invalid_code:
3938 MAYBE_FINISH_COMPOSITION ();
3939 src = src_base;
3940 consumed_chars = consumed_chars_base;
3941 ONE_MORE_BYTE (c);
3942 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3943 char_offset++;
3944 coding->errors++;
3945 /* Reset the invocation and designation status to the safest
3946 one; i.e. designate ASCII to the graphic register 0, and
3947 invoke that register to the graphic plane 0. This typically
3948 helps the case that an designation sequence for ASCII "ESC (
3949 B" is somehow broken (e.g. broken by a newline). */
3950 CODING_ISO_INVOCATION (coding, 0) = 0;
3951 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3952 charset_id_0 = charset_ascii;
3953 continue;
3954
3955 break_loop:
3956 break;
3957 }
3958
3959 no_more_source:
3960 if (cmp_status->state != COMPOSING_NO)
3961 {
3962 if (coding->mode & CODING_MODE_LAST_BLOCK)
3963 MAYBE_FINISH_COMPOSITION ();
3964 else
3965 {
3966 charbuf -= cmp_status->length;
3967 for (i = 0; i < cmp_status->length; i++)
3968 cmp_status->carryover[i] = charbuf[i];
3969 }
3970 }
3971 else if (last_id != charset_ascii)
3972 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3973 coding->consumed_char += consumed_chars_base;
3974 coding->consumed = src_base - coding->source;
3975 coding->charbuf_used = charbuf - coding->charbuf;
3976 }
3977
3978
3979 /* ISO2022 encoding stuff. */
3980
3981 /*
3982 It is not enough to say just "ISO2022" on encoding, we have to
3983 specify more details. In Emacs, each coding system of ISO2022
3984 variant has the following specifications:
3985 1. Initial designation to G0 thru G3.
3986 2. Allows short-form designation?
3987 3. ASCII should be designated to G0 before control characters?
3988 4. ASCII should be designated to G0 at end of line?
3989 5. 7-bit environment or 8-bit environment?
3990 6. Use locking-shift?
3991 7. Use Single-shift?
3992 And the following two are only for Japanese:
3993 8. Use ASCII in place of JIS0201-1976-Roman?
3994 9. Use JISX0208-1983 in place of JISX0208-1978?
3995 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3996 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3997 details.
3998 */
3999
4000 /* Produce codes (escape sequence) for designating CHARSET to graphic
4001 register REG at DST, and increment DST. If <final-char> of CHARSET is
4002 '@', 'A', or 'B' and the coding system CODING allows, produce
4003 designation sequence of short-form. */
4004
4005 #define ENCODE_DESIGNATION(charset, reg, coding) \
4006 do { \
4007 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4008 const char *intermediate_char_94 = "()*+"; \
4009 const char *intermediate_char_96 = ",-./"; \
4010 int revision = -1; \
4011 \
4012 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4013 revision = CHARSET_ISO_REVISION (charset); \
4014 \
4015 if (revision >= 0) \
4016 { \
4017 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4018 EMIT_ONE_BYTE ('@' + revision); \
4019 } \
4020 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4021 if (CHARSET_DIMENSION (charset) == 1) \
4022 { \
4023 int b; \
4024 if (! CHARSET_ISO_CHARS_96 (charset)) \
4025 b = intermediate_char_94[reg]; \
4026 else \
4027 b = intermediate_char_96[reg]; \
4028 EMIT_ONE_ASCII_BYTE (b); \
4029 } \
4030 else \
4031 { \
4032 EMIT_ONE_ASCII_BYTE ('$'); \
4033 if (! CHARSET_ISO_CHARS_96 (charset)) \
4034 { \
4035 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4036 || reg != 0 \
4037 || final_char < '@' || final_char > 'B') \
4038 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4039 } \
4040 else \
4041 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4042 } \
4043 EMIT_ONE_ASCII_BYTE (final_char); \
4044 \
4045 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4046 } while (0)
4047
4048
4049 /* The following two macros produce codes (control character or escape
4050 sequence) for ISO2022 single-shift functions (single-shift-2 and
4051 single-shift-3). */
4052
4053 #define ENCODE_SINGLE_SHIFT_2 \
4054 do { \
4055 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4056 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4057 else \
4058 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4059 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4060 } while (0)
4061
4062
4063 #define ENCODE_SINGLE_SHIFT_3 \
4064 do { \
4065 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4066 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4067 else \
4068 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4069 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4070 } while (0)
4071
4072
4073 /* The following four macros produce codes (control character or
4074 escape sequence) for ISO2022 locking-shift functions (shift-in,
4075 shift-out, locking-shift-2, and locking-shift-3). */
4076
4077 #define ENCODE_SHIFT_IN \
4078 do { \
4079 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4080 CODING_ISO_INVOCATION (coding, 0) = 0; \
4081 } while (0)
4082
4083
4084 #define ENCODE_SHIFT_OUT \
4085 do { \
4086 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4087 CODING_ISO_INVOCATION (coding, 0) = 1; \
4088 } while (0)
4089
4090
4091 #define ENCODE_LOCKING_SHIFT_2 \
4092 do { \
4093 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4094 CODING_ISO_INVOCATION (coding, 0) = 2; \
4095 } while (0)
4096
4097
4098 #define ENCODE_LOCKING_SHIFT_3 \
4099 do { \
4100 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4101 CODING_ISO_INVOCATION (coding, 0) = 3; \
4102 } while (0)
4103
4104
4105 /* Produce codes for a DIMENSION1 character whose character set is
4106 CHARSET and whose position-code is C1. Designation and invocation
4107 sequences are also produced in advance if necessary. */
4108
4109 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4110 do { \
4111 int id = CHARSET_ID (charset); \
4112 \
4113 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4114 && id == charset_ascii) \
4115 { \
4116 id = charset_jisx0201_roman; \
4117 charset = CHARSET_FROM_ID (id); \
4118 } \
4119 \
4120 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4121 { \
4122 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4123 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4124 else \
4125 EMIT_ONE_BYTE (c1 | 0x80); \
4126 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4127 break; \
4128 } \
4129 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4130 { \
4131 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4132 break; \
4133 } \
4134 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4135 { \
4136 EMIT_ONE_BYTE (c1 | 0x80); \
4137 break; \
4138 } \
4139 else \
4140 /* Since CHARSET is not yet invoked to any graphic planes, we \
4141 must invoke it, or, at first, designate it to some graphic \
4142 register. Then repeat the loop to actually produce the \
4143 character. */ \
4144 dst = encode_invocation_designation (charset, coding, dst, \
4145 &produced_chars); \
4146 } while (1)
4147
4148
4149 /* Produce codes for a DIMENSION2 character whose character set is
4150 CHARSET and whose position-codes are C1 and C2. Designation and
4151 invocation codes are also produced in advance if necessary. */
4152
4153 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4154 do { \
4155 int id = CHARSET_ID (charset); \
4156 \
4157 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4158 && id == charset_jisx0208) \
4159 { \
4160 id = charset_jisx0208_1978; \
4161 charset = CHARSET_FROM_ID (id); \
4162 } \
4163 \
4164 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4165 { \
4166 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4167 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4168 else \
4169 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4170 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4171 break; \
4172 } \
4173 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4174 { \
4175 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4176 break; \
4177 } \
4178 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4179 { \
4180 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4181 break; \
4182 } \
4183 else \
4184 /* Since CHARSET is not yet invoked to any graphic planes, we \
4185 must invoke it, or, at first, designate it to some graphic \
4186 register. Then repeat the loop to actually produce the \
4187 character. */ \
4188 dst = encode_invocation_designation (charset, coding, dst, \
4189 &produced_chars); \
4190 } while (1)
4191
4192
4193 #define ENCODE_ISO_CHARACTER(charset, c) \
4194 do { \
4195 unsigned code; \
4196 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4197 \
4198 if (CHARSET_DIMENSION (charset) == 1) \
4199 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4200 else \
4201 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4202 } while (0)
4203
4204
4205 /* Produce designation and invocation codes at a place pointed by DST
4206 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4207 Return new DST. */
4208
4209 static unsigned char *
4210 encode_invocation_designation (struct charset *charset,
4211 struct coding_system *coding,
4212 unsigned char *dst, ptrdiff_t *p_nchars)
4213 {
4214 bool multibytep = coding->dst_multibyte;
4215 ptrdiff_t produced_chars = *p_nchars;
4216 int reg; /* graphic register number */
4217 int id = CHARSET_ID (charset);
4218
4219 /* At first, check designations. */
4220 for (reg = 0; reg < 4; reg++)
4221 if (id == CODING_ISO_DESIGNATION (coding, reg))
4222 break;
4223
4224 if (reg >= 4)
4225 {
4226 /* CHARSET is not yet designated to any graphic registers. */
4227 /* At first check the requested designation. */
4228 reg = CODING_ISO_REQUEST (coding, id);
4229 if (reg < 0)
4230 /* Since CHARSET requests no special designation, designate it
4231 to graphic register 0. */
4232 reg = 0;
4233
4234 ENCODE_DESIGNATION (charset, reg, coding);
4235 }
4236
4237 if (CODING_ISO_INVOCATION (coding, 0) != reg
4238 && CODING_ISO_INVOCATION (coding, 1) != reg)
4239 {
4240 /* Since the graphic register REG is not invoked to any graphic
4241 planes, invoke it to graphic plane 0. */
4242 switch (reg)
4243 {
4244 case 0: /* graphic register 0 */
4245 ENCODE_SHIFT_IN;
4246 break;
4247
4248 case 1: /* graphic register 1 */
4249 ENCODE_SHIFT_OUT;
4250 break;
4251
4252 case 2: /* graphic register 2 */
4253 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4254 ENCODE_SINGLE_SHIFT_2;
4255 else
4256 ENCODE_LOCKING_SHIFT_2;
4257 break;
4258
4259 case 3: /* graphic register 3 */
4260 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4261 ENCODE_SINGLE_SHIFT_3;
4262 else
4263 ENCODE_LOCKING_SHIFT_3;
4264 break;
4265 }
4266 }
4267
4268 *p_nchars = produced_chars;
4269 return dst;
4270 }
4271
4272
4273 /* Produce codes for designation and invocation to reset the graphic
4274 planes and registers to initial state. */
4275 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4276 do { \
4277 int reg; \
4278 struct charset *charset; \
4279 \
4280 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4281 ENCODE_SHIFT_IN; \
4282 for (reg = 0; reg < 4; reg++) \
4283 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4284 && (CODING_ISO_DESIGNATION (coding, reg) \
4285 != CODING_ISO_INITIAL (coding, reg))) \
4286 { \
4287 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4288 ENCODE_DESIGNATION (charset, reg, coding); \
4289 } \
4290 } while (0)
4291
4292
4293 /* Produce designation sequences of charsets in the line started from
4294 CHARBUF to a place pointed by DST, and return the number of
4295 produced bytes. DST should not directly point a buffer text area
4296 which may be relocated by char_charset call.
4297
4298 If the current block ends before any end-of-line, we may fail to
4299 find all the necessary designations. */
4300
4301 static ptrdiff_t
4302 encode_designation_at_bol (struct coding_system *coding,
4303 int *charbuf, int *charbuf_end,
4304 unsigned char *dst)
4305 {
4306 unsigned char *orig = dst;
4307 struct charset *charset;
4308 /* Table of charsets to be designated to each graphic register. */
4309 int r[4];
4310 int c, found = 0, reg;
4311 ptrdiff_t produced_chars = 0;
4312 bool multibytep = coding->dst_multibyte;
4313 Lisp_Object attrs;
4314 Lisp_Object charset_list;
4315
4316 attrs = CODING_ID_ATTRS (coding->id);
4317 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4318 if (EQ (charset_list, Qiso_2022))
4319 charset_list = Viso_2022_charset_list;
4320
4321 for (reg = 0; reg < 4; reg++)
4322 r[reg] = -1;
4323
4324 while (charbuf < charbuf_end && found < 4)
4325 {
4326 int id;
4327
4328 c = *charbuf++;
4329 if (c == '\n')
4330 break;
4331 charset = char_charset (c, charset_list, NULL);
4332 id = CHARSET_ID (charset);
4333 reg = CODING_ISO_REQUEST (coding, id);
4334 if (reg >= 0 && r[reg] < 0)
4335 {
4336 found++;
4337 r[reg] = id;
4338 }
4339 }
4340
4341 if (found)
4342 {
4343 for (reg = 0; reg < 4; reg++)
4344 if (r[reg] >= 0
4345 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4346 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4347 }
4348
4349 return dst - orig;
4350 }
4351
4352 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4353
4354 static bool
4355 encode_coding_iso_2022 (struct coding_system *coding)
4356 {
4357 bool multibytep = coding->dst_multibyte;
4358 int *charbuf = coding->charbuf;
4359 int *charbuf_end = charbuf + coding->charbuf_used;
4360 unsigned char *dst = coding->destination + coding->produced;
4361 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4362 int safe_room = 16;
4363 bool bol_designation
4364 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4365 && CODING_ISO_BOL (coding));
4366 ptrdiff_t produced_chars = 0;
4367 Lisp_Object attrs, eol_type, charset_list;
4368 bool ascii_compatible;
4369 int c;
4370 int preferred_charset_id = -1;
4371
4372 CODING_GET_INFO (coding, attrs, charset_list);
4373 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4374 if (VECTORP (eol_type))
4375 eol_type = Qunix;
4376
4377 setup_iso_safe_charsets (attrs);
4378 /* Charset list may have been changed. */
4379 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4380 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4381
4382 ascii_compatible
4383 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4384 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4385 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4386
4387 while (charbuf < charbuf_end)
4388 {
4389 ASSURE_DESTINATION (safe_room);
4390
4391 if (bol_designation)
4392 {
4393 /* We have to produce designation sequences if any now. */
4394 unsigned char desig_buf[16];
4395 int nbytes;
4396 ptrdiff_t offset;
4397
4398 charset_map_loaded = 0;
4399 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4400 desig_buf);
4401 if (charset_map_loaded
4402 && (offset = coding_change_destination (coding)))
4403 {
4404 dst += offset;
4405 dst_end += offset;
4406 }
4407 memcpy (dst, desig_buf, nbytes);
4408 dst += nbytes;
4409 /* We are sure that designation sequences are all ASCII bytes. */
4410 produced_chars += nbytes;
4411 bol_designation = 0;
4412 ASSURE_DESTINATION (safe_room);
4413 }
4414
4415 c = *charbuf++;
4416
4417 if (c < 0)
4418 {
4419 /* Handle an annotation. */
4420 switch (*charbuf)
4421 {
4422 case CODING_ANNOTATE_COMPOSITION_MASK:
4423 /* Not yet implemented. */
4424 break;
4425 case CODING_ANNOTATE_CHARSET_MASK:
4426 preferred_charset_id = charbuf[2];
4427 if (preferred_charset_id >= 0
4428 && NILP (Fmemq (make_number (preferred_charset_id),
4429 charset_list)))
4430 preferred_charset_id = -1;
4431 break;
4432 default:
4433 emacs_abort ();
4434 }
4435 charbuf += -c - 1;
4436 continue;
4437 }
4438
4439 /* Now encode the character C. */
4440 if (c < 0x20 || c == 0x7F)
4441 {
4442 if (c == '\n'
4443 || (c == '\r' && EQ (eol_type, Qmac)))
4444 {
4445 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4446 ENCODE_RESET_PLANE_AND_REGISTER ();
4447 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4448 {
4449 int i;
4450
4451 for (i = 0; i < 4; i++)
4452 CODING_ISO_DESIGNATION (coding, i)
4453 = CODING_ISO_INITIAL (coding, i);
4454 }
4455 bol_designation = ((CODING_ISO_FLAGS (coding)
4456 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4457 != 0);
4458 }
4459 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4460 ENCODE_RESET_PLANE_AND_REGISTER ();
4461 EMIT_ONE_ASCII_BYTE (c);
4462 }
4463 else if (ASCII_CHAR_P (c))
4464 {
4465 if (ascii_compatible)
4466 EMIT_ONE_ASCII_BYTE (c);
4467 else
4468 {
4469 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4470 ENCODE_ISO_CHARACTER (charset, c);
4471 }
4472 }
4473 else if (CHAR_BYTE8_P (c))
4474 {
4475 c = CHAR_TO_BYTE8 (c);
4476 EMIT_ONE_BYTE (c);
4477 }
4478 else
4479 {
4480 struct charset *charset;
4481
4482 if (preferred_charset_id >= 0)
4483 {
4484 bool result;
4485
4486 charset = CHARSET_FROM_ID (preferred_charset_id);
4487 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4488 if (! result)
4489 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4490 NULL, charset);
4491 }
4492 else
4493 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4494 NULL, charset);
4495 if (!charset)
4496 {
4497 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4498 {
4499 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4500 charset = CHARSET_FROM_ID (charset_ascii);
4501 }
4502 else
4503 {
4504 c = coding->default_char;
4505 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4506 charset_list, NULL, charset);
4507 }
4508 }
4509 ENCODE_ISO_CHARACTER (charset, c);
4510 }
4511 }
4512
4513 if (coding->mode & CODING_MODE_LAST_BLOCK
4514 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4515 {
4516 ASSURE_DESTINATION (safe_room);
4517 ENCODE_RESET_PLANE_AND_REGISTER ();
4518 }
4519 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4520 CODING_ISO_BOL (coding) = bol_designation;
4521 coding->produced_char += produced_chars;
4522 coding->produced = dst - coding->destination;
4523 return 0;
4524 }
4525
4526 \f
4527 /*** 8,9. SJIS and BIG5 handlers ***/
4528
4529 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4530 quite widely. So, for the moment, Emacs supports them in the bare
4531 C code. But, in the future, they may be supported only by CCL. */
4532
4533 /* SJIS is a coding system encoding three character sets: ASCII, right
4534 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4535 as is. A character of charset katakana-jisx0201 is encoded by
4536 "position-code + 0x80". A character of charset japanese-jisx0208
4537 is encoded in 2-byte but two position-codes are divided and shifted
4538 so that it fit in the range below.
4539
4540 --- CODE RANGE of SJIS ---
4541 (character set) (range)
4542 ASCII 0x00 .. 0x7F
4543 KATAKANA-JISX0201 0xA0 .. 0xDF
4544 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4545 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4546 -------------------------------
4547
4548 */
4549
4550 /* BIG5 is a coding system encoding two character sets: ASCII and
4551 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4552 character set and is encoded in two-byte.
4553
4554 --- CODE RANGE of BIG5 ---
4555 (character set) (range)
4556 ASCII 0x00 .. 0x7F
4557 Big5 (1st byte) 0xA1 .. 0xFE
4558 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4559 --------------------------
4560
4561 */
4562
4563 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4564 Return true if a text is encoded in SJIS. */
4565
4566 static bool
4567 detect_coding_sjis (struct coding_system *coding,
4568 struct coding_detection_info *detect_info)
4569 {
4570 const unsigned char *src = coding->source, *src_base;
4571 const unsigned char *src_end = coding->source + coding->src_bytes;
4572 bool multibytep = coding->src_multibyte;
4573 ptrdiff_t consumed_chars = 0;
4574 int found = 0;
4575 int c;
4576 Lisp_Object attrs, charset_list;
4577 int max_first_byte_of_2_byte_code;
4578
4579 CODING_GET_INFO (coding, attrs, charset_list);
4580 max_first_byte_of_2_byte_code
4581 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4582
4583 detect_info->checked |= CATEGORY_MASK_SJIS;
4584 /* A coding system of this category is always ASCII compatible. */
4585 src += coding->head_ascii;
4586
4587 while (1)
4588 {
4589 src_base = src;
4590 ONE_MORE_BYTE (c);
4591 if (c < 0x80)
4592 continue;
4593 if ((c >= 0x81 && c <= 0x9F)
4594 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4595 {
4596 ONE_MORE_BYTE (c);
4597 if (c < 0x40 || c == 0x7F || c > 0xFC)
4598 break;
4599 found = CATEGORY_MASK_SJIS;
4600 }
4601 else if (c >= 0xA0 && c < 0xE0)
4602 found = CATEGORY_MASK_SJIS;
4603 else
4604 break;
4605 }
4606 detect_info->rejected |= CATEGORY_MASK_SJIS;
4607 return 0;
4608
4609 no_more_source:
4610 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4611 {
4612 detect_info->rejected |= CATEGORY_MASK_SJIS;
4613 return 0;
4614 }
4615 detect_info->found |= found;
4616 return 1;
4617 }
4618
4619 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4620 Return true if a text is encoded in BIG5. */
4621
4622 static bool
4623 detect_coding_big5 (struct coding_system *coding,
4624 struct coding_detection_info *detect_info)
4625 {
4626 const unsigned char *src = coding->source, *src_base;
4627 const unsigned char *src_end = coding->source + coding->src_bytes;
4628 bool multibytep = coding->src_multibyte;
4629 ptrdiff_t consumed_chars = 0;
4630 int found = 0;
4631 int c;
4632
4633 detect_info->checked |= CATEGORY_MASK_BIG5;
4634 /* A coding system of this category is always ASCII compatible. */
4635 src += coding->head_ascii;
4636
4637 while (1)
4638 {
4639 src_base = src;
4640 ONE_MORE_BYTE (c);
4641 if (c < 0x80)
4642 continue;
4643 if (c >= 0xA1)
4644 {
4645 ONE_MORE_BYTE (c);
4646 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4647 return 0;
4648 found = CATEGORY_MASK_BIG5;
4649 }
4650 else
4651 break;
4652 }
4653 detect_info->rejected |= CATEGORY_MASK_BIG5;
4654 return 0;
4655
4656 no_more_source:
4657 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4658 {
4659 detect_info->rejected |= CATEGORY_MASK_BIG5;
4660 return 0;
4661 }
4662 detect_info->found |= found;
4663 return 1;
4664 }
4665
4666 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4667
4668 static void
4669 decode_coding_sjis (struct coding_system *coding)
4670 {
4671 const unsigned char *src = coding->source + coding->consumed;
4672 const unsigned char *src_end = coding->source + coding->src_bytes;
4673 const unsigned char *src_base;
4674 int *charbuf = coding->charbuf + coding->charbuf_used;
4675 /* We may produce one charset annotation in one loop and one more at
4676 the end. */
4677 int *charbuf_end
4678 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4679 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4680 bool multibytep = coding->src_multibyte;
4681 struct charset *charset_roman, *charset_kanji, *charset_kana;
4682 struct charset *charset_kanji2;
4683 Lisp_Object attrs, charset_list, val;
4684 ptrdiff_t char_offset = coding->produced_char;
4685 ptrdiff_t last_offset = char_offset;
4686 int last_id = charset_ascii;
4687 bool eol_dos
4688 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4689 int byte_after_cr = -1;
4690
4691 CODING_GET_INFO (coding, attrs, charset_list);
4692
4693 val = charset_list;
4694 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4695 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4696 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4697 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4698
4699 while (1)
4700 {
4701 int c, c1;
4702 struct charset *charset;
4703
4704 src_base = src;
4705 consumed_chars_base = consumed_chars;
4706
4707 if (charbuf >= charbuf_end)
4708 {
4709 if (byte_after_cr >= 0)
4710 src_base--;
4711 break;
4712 }
4713
4714 if (byte_after_cr >= 0)
4715 c = byte_after_cr, byte_after_cr = -1;
4716 else
4717 ONE_MORE_BYTE (c);
4718 if (c < 0)
4719 goto invalid_code;
4720 if (c < 0x80)
4721 {
4722 if (eol_dos && c == '\r')
4723 ONE_MORE_BYTE (byte_after_cr);
4724 charset = charset_roman;
4725 }
4726 else if (c == 0x80 || c == 0xA0)
4727 goto invalid_code;
4728 else if (c >= 0xA1 && c <= 0xDF)
4729 {
4730 /* SJIS -> JISX0201-Kana */
4731 c &= 0x7F;
4732 charset = charset_kana;
4733 }
4734 else if (c <= 0xEF)
4735 {
4736 /* SJIS -> JISX0208 */
4737 ONE_MORE_BYTE (c1);
4738 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4739 goto invalid_code;
4740 c = (c << 8) | c1;
4741 SJIS_TO_JIS (c);
4742 charset = charset_kanji;
4743 }
4744 else if (c <= 0xFC && charset_kanji2)
4745 {
4746 /* SJIS -> JISX0213-2 */
4747 ONE_MORE_BYTE (c1);
4748 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4749 goto invalid_code;
4750 c = (c << 8) | c1;
4751 SJIS_TO_JIS2 (c);
4752 charset = charset_kanji2;
4753 }
4754 else
4755 goto invalid_code;
4756 if (charset->id != charset_ascii
4757 && last_id != charset->id)
4758 {
4759 if (last_id != charset_ascii)
4760 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4761 last_id = charset->id;
4762 last_offset = char_offset;
4763 }
4764 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4765 *charbuf++ = c;
4766 char_offset++;
4767 continue;
4768
4769 invalid_code:
4770 src = src_base;
4771 consumed_chars = consumed_chars_base;
4772 ONE_MORE_BYTE (c);
4773 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4774 char_offset++;
4775 coding->errors++;
4776 }
4777
4778 no_more_source:
4779 if (last_id != charset_ascii)
4780 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4781 coding->consumed_char += consumed_chars_base;
4782 coding->consumed = src_base - coding->source;
4783 coding->charbuf_used = charbuf - coding->charbuf;
4784 }
4785
4786 static void
4787 decode_coding_big5 (struct coding_system *coding)
4788 {
4789 const unsigned char *src = coding->source + coding->consumed;
4790 const unsigned char *src_end = coding->source + coding->src_bytes;
4791 const unsigned char *src_base;
4792 int *charbuf = coding->charbuf + coding->charbuf_used;
4793 /* We may produce one charset annotation in one loop and one more at
4794 the end. */
4795 int *charbuf_end
4796 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4797 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4798 bool multibytep = coding->src_multibyte;
4799 struct charset *charset_roman, *charset_big5;
4800 Lisp_Object attrs, charset_list, val;
4801 ptrdiff_t char_offset = coding->produced_char;
4802 ptrdiff_t last_offset = char_offset;
4803 int last_id = charset_ascii;
4804 bool eol_dos
4805 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4806 int byte_after_cr = -1;
4807
4808 CODING_GET_INFO (coding, attrs, charset_list);
4809 val = charset_list;
4810 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4811 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4812
4813 while (1)
4814 {
4815 int c, c1;
4816 struct charset *charset;
4817
4818 src_base = src;
4819 consumed_chars_base = consumed_chars;
4820
4821 if (charbuf >= charbuf_end)
4822 {
4823 if (byte_after_cr >= 0)
4824 src_base--;
4825 break;
4826 }
4827
4828 if (byte_after_cr >= 0)
4829 c = byte_after_cr, byte_after_cr = -1;
4830 else
4831 ONE_MORE_BYTE (c);
4832
4833 if (c < 0)
4834 goto invalid_code;
4835 if (c < 0x80)
4836 {
4837 if (eol_dos && c == '\r')
4838 ONE_MORE_BYTE (byte_after_cr);
4839 charset = charset_roman;
4840 }
4841 else
4842 {
4843 /* BIG5 -> Big5 */
4844 if (c < 0xA1 || c > 0xFE)
4845 goto invalid_code;
4846 ONE_MORE_BYTE (c1);
4847 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4848 goto invalid_code;
4849 c = c << 8 | c1;
4850 charset = charset_big5;
4851 }
4852 if (charset->id != charset_ascii
4853 && last_id != charset->id)
4854 {
4855 if (last_id != charset_ascii)
4856 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4857 last_id = charset->id;
4858 last_offset = char_offset;
4859 }
4860 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4861 *charbuf++ = c;
4862 char_offset++;
4863 continue;
4864
4865 invalid_code:
4866 src = src_base;
4867 consumed_chars = consumed_chars_base;
4868 ONE_MORE_BYTE (c);
4869 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4870 char_offset++;
4871 coding->errors++;
4872 }
4873
4874 no_more_source:
4875 if (last_id != charset_ascii)
4876 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4877 coding->consumed_char += consumed_chars_base;
4878 coding->consumed = src_base - coding->source;
4879 coding->charbuf_used = charbuf - coding->charbuf;
4880 }
4881
4882 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4883 This function can encode charsets `ascii', `katakana-jisx0201',
4884 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4885 are sure that all these charsets are registered as official charset
4886 (i.e. do not have extended leading-codes). Characters of other
4887 charsets are produced without any encoding. */
4888
4889 static bool
4890 encode_coding_sjis (struct coding_system *coding)
4891 {
4892 bool multibytep = coding->dst_multibyte;
4893 int *charbuf = coding->charbuf;
4894 int *charbuf_end = charbuf + coding->charbuf_used;
4895 unsigned char *dst = coding->destination + coding->produced;
4896 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4897 int safe_room = 4;
4898 ptrdiff_t produced_chars = 0;
4899 Lisp_Object attrs, charset_list, val;
4900 bool ascii_compatible;
4901 struct charset *charset_kanji, *charset_kana;
4902 struct charset *charset_kanji2;
4903 int c;
4904
4905 CODING_GET_INFO (coding, attrs, charset_list);
4906 val = XCDR (charset_list);
4907 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4908 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4909 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4910
4911 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4912
4913 while (charbuf < charbuf_end)
4914 {
4915 ASSURE_DESTINATION (safe_room);
4916 c = *charbuf++;
4917 /* Now encode the character C. */
4918 if (ASCII_CHAR_P (c) && ascii_compatible)
4919 EMIT_ONE_ASCII_BYTE (c);
4920 else if (CHAR_BYTE8_P (c))
4921 {
4922 c = CHAR_TO_BYTE8 (c);
4923 EMIT_ONE_BYTE (c);
4924 }
4925 else
4926 {
4927 unsigned code;
4928 struct charset *charset;
4929 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4930 &code, charset);
4931
4932 if (!charset)
4933 {
4934 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4935 {
4936 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4937 charset = CHARSET_FROM_ID (charset_ascii);
4938 }
4939 else
4940 {
4941 c = coding->default_char;
4942 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4943 charset_list, &code, charset);
4944 }
4945 }
4946 if (code == CHARSET_INVALID_CODE (charset))
4947 emacs_abort ();
4948 if (charset == charset_kanji)
4949 {
4950 int c1, c2;
4951 JIS_TO_SJIS (code);
4952 c1 = code >> 8, c2 = code & 0xFF;
4953 EMIT_TWO_BYTES (c1, c2);
4954 }
4955 else if (charset == charset_kana)
4956 EMIT_ONE_BYTE (code | 0x80);
4957 else if (charset_kanji2 && charset == charset_kanji2)
4958 {
4959 int c1, c2;
4960
4961 c1 = code >> 8;
4962 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4963 || c1 == 0x28
4964 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4965 {
4966 JIS_TO_SJIS2 (code);
4967 c1 = code >> 8, c2 = code & 0xFF;
4968 EMIT_TWO_BYTES (c1, c2);
4969 }
4970 else
4971 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4972 }
4973 else
4974 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4975 }
4976 }
4977 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4978 coding->produced_char += produced_chars;
4979 coding->produced = dst - coding->destination;
4980 return 0;
4981 }
4982
4983 static bool
4984 encode_coding_big5 (struct coding_system *coding)
4985 {
4986 bool multibytep = coding->dst_multibyte;
4987 int *charbuf = coding->charbuf;
4988 int *charbuf_end = charbuf + coding->charbuf_used;
4989 unsigned char *dst = coding->destination + coding->produced;
4990 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4991 int safe_room = 4;
4992 ptrdiff_t produced_chars = 0;
4993 Lisp_Object attrs, charset_list, val;
4994 bool ascii_compatible;
4995 struct charset *charset_big5;
4996 int c;
4997
4998 CODING_GET_INFO (coding, attrs, charset_list);
4999 val = XCDR (charset_list);
5000 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5001 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5002
5003 while (charbuf < charbuf_end)
5004 {
5005 ASSURE_DESTINATION (safe_room);
5006 c = *charbuf++;
5007 /* Now encode the character C. */
5008 if (ASCII_CHAR_P (c) && ascii_compatible)
5009 EMIT_ONE_ASCII_BYTE (c);
5010 else if (CHAR_BYTE8_P (c))
5011 {
5012 c = CHAR_TO_BYTE8 (c);
5013 EMIT_ONE_BYTE (c);
5014 }
5015 else
5016 {
5017 unsigned code;
5018 struct charset *charset;
5019 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5020 &code, charset);
5021
5022 if (! charset)
5023 {
5024 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5025 {
5026 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5027 charset = CHARSET_FROM_ID (charset_ascii);
5028 }
5029 else
5030 {
5031 c = coding->default_char;
5032 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5033 charset_list, &code, charset);
5034 }
5035 }
5036 if (code == CHARSET_INVALID_CODE (charset))
5037 emacs_abort ();
5038 if (charset == charset_big5)
5039 {
5040 int c1, c2;
5041
5042 c1 = code >> 8, c2 = code & 0xFF;
5043 EMIT_TWO_BYTES (c1, c2);
5044 }
5045 else
5046 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5047 }
5048 }
5049 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5050 coding->produced_char += produced_chars;
5051 coding->produced = dst - coding->destination;
5052 return 0;
5053 }
5054
5055 \f
5056 /*** 10. CCL handlers ***/
5057
5058 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5059 Return true if a text is encoded in a coding system of which
5060 encoder/decoder are written in CCL program. */
5061
5062 static bool
5063 detect_coding_ccl (struct coding_system *coding,
5064 struct coding_detection_info *detect_info)
5065 {
5066 const unsigned char *src = coding->source, *src_base;
5067 const unsigned char *src_end = coding->source + coding->src_bytes;
5068 bool multibytep = coding->src_multibyte;
5069 ptrdiff_t consumed_chars = 0;
5070 int found = 0;
5071 unsigned char *valids;
5072 ptrdiff_t head_ascii = coding->head_ascii;
5073 Lisp_Object attrs;
5074
5075 detect_info->checked |= CATEGORY_MASK_CCL;
5076
5077 coding = &coding_categories[coding_category_ccl];
5078 valids = CODING_CCL_VALIDS (coding);
5079 attrs = CODING_ID_ATTRS (coding->id);
5080 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5081 src += head_ascii;
5082
5083 while (1)
5084 {
5085 int c;
5086
5087 src_base = src;
5088 ONE_MORE_BYTE (c);
5089 if (c < 0 || ! valids[c])
5090 break;
5091 if ((valids[c] > 1))
5092 found = CATEGORY_MASK_CCL;
5093 }
5094 detect_info->rejected |= CATEGORY_MASK_CCL;
5095 return 0;
5096
5097 no_more_source:
5098 detect_info->found |= found;
5099 return 1;
5100 }
5101
5102 static void
5103 decode_coding_ccl (struct coding_system *coding)
5104 {
5105 const unsigned char *src = coding->source + coding->consumed;
5106 const unsigned char *src_end = coding->source + coding->src_bytes;
5107 int *charbuf = coding->charbuf + coding->charbuf_used;
5108 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5109 ptrdiff_t consumed_chars = 0;
5110 bool multibytep = coding->src_multibyte;
5111 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5112 int source_charbuf[1024];
5113 int source_byteidx[1025];
5114 Lisp_Object attrs, charset_list;
5115
5116 CODING_GET_INFO (coding, attrs, charset_list);
5117
5118 while (1)
5119 {
5120 const unsigned char *p = src;
5121 ptrdiff_t offset;
5122 int i = 0;
5123
5124 if (multibytep)
5125 {
5126 while (i < 1024 && p < src_end)
5127 {
5128 source_byteidx[i] = p - src;
5129 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5130 }
5131 source_byteidx[i] = p - src;
5132 }
5133 else
5134 while (i < 1024 && p < src_end)
5135 source_charbuf[i++] = *p++;
5136
5137 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5138 ccl->last_block = 1;
5139 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5140 charset_map_loaded = 0;
5141 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5142 charset_list);
5143 if (charset_map_loaded
5144 && (offset = coding_change_source (coding)))
5145 {
5146 p += offset;
5147 src += offset;
5148 src_end += offset;
5149 }
5150 charbuf += ccl->produced;
5151 if (multibytep)
5152 src += source_byteidx[ccl->consumed];
5153 else
5154 src += ccl->consumed;
5155 consumed_chars += ccl->consumed;
5156 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5157 break;
5158 }
5159
5160 switch (ccl->status)
5161 {
5162 case CCL_STAT_SUSPEND_BY_SRC:
5163 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5164 break;
5165 case CCL_STAT_SUSPEND_BY_DST:
5166 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5167 break;
5168 case CCL_STAT_QUIT:
5169 case CCL_STAT_INVALID_CMD:
5170 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5171 break;
5172 default:
5173 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5174 break;
5175 }
5176 coding->consumed_char += consumed_chars;
5177 coding->consumed = src - coding->source;
5178 coding->charbuf_used = charbuf - coding->charbuf;
5179 }
5180
5181 static bool
5182 encode_coding_ccl (struct coding_system *coding)
5183 {
5184 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5185 bool multibytep = coding->dst_multibyte;
5186 int *charbuf = coding->charbuf;
5187 int *charbuf_end = charbuf + coding->charbuf_used;
5188 unsigned char *dst = coding->destination + coding->produced;
5189 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5190 int destination_charbuf[1024];
5191 ptrdiff_t produced_chars = 0;
5192 int i;
5193 Lisp_Object attrs, charset_list;
5194
5195 CODING_GET_INFO (coding, attrs, charset_list);
5196 if (coding->consumed_char == coding->src_chars
5197 && coding->mode & CODING_MODE_LAST_BLOCK)
5198 ccl->last_block = 1;
5199
5200 do
5201 {
5202 ptrdiff_t offset;
5203
5204 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5205 charset_map_loaded = 0;
5206 ccl_driver (ccl, charbuf, destination_charbuf,
5207 charbuf_end - charbuf, 1024, charset_list);
5208 if (charset_map_loaded
5209 && (offset = coding_change_destination (coding)))
5210 dst += offset;
5211 if (multibytep)
5212 {
5213 ASSURE_DESTINATION (ccl->produced * 2);
5214 for (i = 0; i < ccl->produced; i++)
5215 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5216 }
5217 else
5218 {
5219 ASSURE_DESTINATION (ccl->produced);
5220 for (i = 0; i < ccl->produced; i++)
5221 *dst++ = destination_charbuf[i] & 0xFF;
5222 produced_chars += ccl->produced;
5223 }
5224 charbuf += ccl->consumed;
5225 if (ccl->status == CCL_STAT_QUIT
5226 || ccl->status == CCL_STAT_INVALID_CMD)
5227 break;
5228 }
5229 while (charbuf < charbuf_end);
5230
5231 switch (ccl->status)
5232 {
5233 case CCL_STAT_SUSPEND_BY_SRC:
5234 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5235 break;
5236 case CCL_STAT_SUSPEND_BY_DST:
5237 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5238 break;
5239 case CCL_STAT_QUIT:
5240 case CCL_STAT_INVALID_CMD:
5241 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5242 break;
5243 default:
5244 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5245 break;
5246 }
5247
5248 coding->produced_char += produced_chars;
5249 coding->produced = dst - coding->destination;
5250 return 0;
5251 }
5252
5253 \f
5254 /*** 10, 11. no-conversion handlers ***/
5255
5256 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5257
5258 static void
5259 decode_coding_raw_text (struct coding_system *coding)
5260 {
5261 bool eol_dos
5262 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5263
5264 coding->chars_at_source = 1;
5265 coding->consumed_char = coding->src_chars;
5266 coding->consumed = coding->src_bytes;
5267 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5268 {
5269 coding->consumed_char--;
5270 coding->consumed--;
5271 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5272 }
5273 else
5274 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5275 }
5276
5277 static bool
5278 encode_coding_raw_text (struct coding_system *coding)
5279 {
5280 bool multibytep = coding->dst_multibyte;
5281 int *charbuf = coding->charbuf;
5282 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5283 unsigned char *dst = coding->destination + coding->produced;
5284 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5285 ptrdiff_t produced_chars = 0;
5286 int c;
5287
5288 if (multibytep)
5289 {
5290 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5291
5292 if (coding->src_multibyte)
5293 while (charbuf < charbuf_end)
5294 {
5295 ASSURE_DESTINATION (safe_room);
5296 c = *charbuf++;
5297 if (ASCII_CHAR_P (c))
5298 EMIT_ONE_ASCII_BYTE (c);
5299 else if (CHAR_BYTE8_P (c))
5300 {
5301 c = CHAR_TO_BYTE8 (c);
5302 EMIT_ONE_BYTE (c);
5303 }
5304 else
5305 {
5306 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5307
5308 CHAR_STRING_ADVANCE (c, p1);
5309 do
5310 {
5311 EMIT_ONE_BYTE (*p0);
5312 p0++;
5313 }
5314 while (p0 < p1);
5315 }
5316 }
5317 else
5318 while (charbuf < charbuf_end)
5319 {
5320 ASSURE_DESTINATION (safe_room);
5321 c = *charbuf++;
5322 EMIT_ONE_BYTE (c);
5323 }
5324 }
5325 else
5326 {
5327 if (coding->src_multibyte)
5328 {
5329 int safe_room = MAX_MULTIBYTE_LENGTH;
5330
5331 while (charbuf < charbuf_end)
5332 {
5333 ASSURE_DESTINATION (safe_room);
5334 c = *charbuf++;
5335 if (ASCII_CHAR_P (c))
5336 *dst++ = c;
5337 else if (CHAR_BYTE8_P (c))
5338 *dst++ = CHAR_TO_BYTE8 (c);
5339 else
5340 CHAR_STRING_ADVANCE (c, dst);
5341 }
5342 }
5343 else
5344 {
5345 ASSURE_DESTINATION (charbuf_end - charbuf);
5346 while (charbuf < charbuf_end && dst < dst_end)
5347 *dst++ = *charbuf++;
5348 }
5349 produced_chars = dst - (coding->destination + coding->produced);
5350 }
5351 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5352 coding->produced_char += produced_chars;
5353 coding->produced = dst - coding->destination;
5354 return 0;
5355 }
5356
5357 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5358 Return true if a text is encoded in a charset-based coding system. */
5359
5360 static bool
5361 detect_coding_charset (struct coding_system *coding,
5362 struct coding_detection_info *detect_info)
5363 {
5364 const unsigned char *src = coding->source, *src_base;
5365 const unsigned char *src_end = coding->source + coding->src_bytes;
5366 bool multibytep = coding->src_multibyte;
5367 ptrdiff_t consumed_chars = 0;
5368 Lisp_Object attrs, valids, name;
5369 int found = 0;
5370 ptrdiff_t head_ascii = coding->head_ascii;
5371 bool check_latin_extra = 0;
5372
5373 detect_info->checked |= CATEGORY_MASK_CHARSET;
5374
5375 coding = &coding_categories[coding_category_charset];
5376 attrs = CODING_ID_ATTRS (coding->id);
5377 valids = AREF (attrs, coding_attr_charset_valids);
5378 name = CODING_ID_NAME (coding->id);
5379 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5380 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5381 || strncmp (SSDATA (SYMBOL_NAME (name)),
5382 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5383 check_latin_extra = 1;
5384
5385 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5386 src += head_ascii;
5387
5388 while (1)
5389 {
5390 int c;
5391 Lisp_Object val;
5392 struct charset *charset;
5393 int dim, idx;
5394
5395 src_base = src;
5396 ONE_MORE_BYTE (c);
5397 if (c < 0)
5398 continue;
5399 val = AREF (valids, c);
5400 if (NILP (val))
5401 break;
5402 if (c >= 0x80)
5403 {
5404 if (c < 0xA0
5405 && check_latin_extra
5406 && (!VECTORP (Vlatin_extra_code_table)
5407 || NILP (AREF (Vlatin_extra_code_table, c))))
5408 break;
5409 found = CATEGORY_MASK_CHARSET;
5410 }
5411 if (INTEGERP (val))
5412 {
5413 charset = CHARSET_FROM_ID (XFASTINT (val));
5414 dim = CHARSET_DIMENSION (charset);
5415 for (idx = 1; idx < dim; idx++)
5416 {
5417 if (src == src_end)
5418 goto too_short;
5419 ONE_MORE_BYTE (c);
5420 if (c < charset->code_space[(dim - 1 - idx) * 4]
5421 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5422 break;
5423 }
5424 if (idx < dim)
5425 break;
5426 }
5427 else
5428 {
5429 idx = 1;
5430 for (; CONSP (val); val = XCDR (val))
5431 {
5432 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5433 dim = CHARSET_DIMENSION (charset);
5434 while (idx < dim)
5435 {
5436 if (src == src_end)
5437 goto too_short;
5438 ONE_MORE_BYTE (c);
5439 if (c < charset->code_space[(dim - 1 - idx) * 4]
5440 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5441 break;
5442 idx++;
5443 }
5444 if (idx == dim)
5445 {
5446 val = Qnil;
5447 break;
5448 }
5449 }
5450 if (CONSP (val))
5451 break;
5452 }
5453 }
5454 too_short:
5455 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5456 return 0;
5457
5458 no_more_source:
5459 detect_info->found |= found;
5460 return 1;
5461 }
5462
5463 static void
5464 decode_coding_charset (struct coding_system *coding)
5465 {
5466 const unsigned char *src = coding->source + coding->consumed;
5467 const unsigned char *src_end = coding->source + coding->src_bytes;
5468 const unsigned char *src_base;
5469 int *charbuf = coding->charbuf + coding->charbuf_used;
5470 /* We may produce one charset annotation in one loop and one more at
5471 the end. */
5472 int *charbuf_end
5473 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5474 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5475 bool multibytep = coding->src_multibyte;
5476 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5477 Lisp_Object valids;
5478 ptrdiff_t char_offset = coding->produced_char;
5479 ptrdiff_t last_offset = char_offset;
5480 int last_id = charset_ascii;
5481 bool eol_dos
5482 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5483 int byte_after_cr = -1;
5484
5485 valids = AREF (attrs, coding_attr_charset_valids);
5486
5487 while (1)
5488 {
5489 int c;
5490 Lisp_Object val;
5491 struct charset *charset;
5492 int dim;
5493 int len = 1;
5494 unsigned code;
5495
5496 src_base = src;
5497 consumed_chars_base = consumed_chars;
5498
5499 if (charbuf >= charbuf_end)
5500 {
5501 if (byte_after_cr >= 0)
5502 src_base--;
5503 break;
5504 }
5505
5506 if (byte_after_cr >= 0)
5507 {
5508 c = byte_after_cr;
5509 byte_after_cr = -1;
5510 }
5511 else
5512 {
5513 ONE_MORE_BYTE (c);
5514 if (eol_dos && c == '\r')
5515 ONE_MORE_BYTE (byte_after_cr);
5516 }
5517 if (c < 0)
5518 goto invalid_code;
5519 code = c;
5520
5521 val = AREF (valids, c);
5522 if (! INTEGERP (val) && ! CONSP (val))
5523 goto invalid_code;
5524 if (INTEGERP (val))
5525 {
5526 charset = CHARSET_FROM_ID (XFASTINT (val));
5527 dim = CHARSET_DIMENSION (charset);
5528 while (len < dim)
5529 {
5530 ONE_MORE_BYTE (c);
5531 code = (code << 8) | c;
5532 len++;
5533 }
5534 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5535 charset, code, c);
5536 }
5537 else
5538 {
5539 /* VAL is a list of charset IDs. It is assured that the
5540 list is sorted by charset dimensions (smaller one
5541 comes first). */
5542 while (CONSP (val))
5543 {
5544 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5545 dim = CHARSET_DIMENSION (charset);
5546 while (len < dim)
5547 {
5548 ONE_MORE_BYTE (c);
5549 code = (code << 8) | c;
5550 len++;
5551 }
5552 CODING_DECODE_CHAR (coding, src, src_base,
5553 src_end, charset, code, c);
5554 if (c >= 0)
5555 break;
5556 val = XCDR (val);
5557 }
5558 }
5559 if (c < 0)
5560 goto invalid_code;
5561 if (charset->id != charset_ascii
5562 && last_id != charset->id)
5563 {
5564 if (last_id != charset_ascii)
5565 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5566 last_id = charset->id;
5567 last_offset = char_offset;
5568 }
5569
5570 *charbuf++ = c;
5571 char_offset++;
5572 continue;
5573
5574 invalid_code:
5575 src = src_base;
5576 consumed_chars = consumed_chars_base;
5577 ONE_MORE_BYTE (c);
5578 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5579 char_offset++;
5580 coding->errors++;
5581 }
5582
5583 no_more_source:
5584 if (last_id != charset_ascii)
5585 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5586 coding->consumed_char += consumed_chars_base;
5587 coding->consumed = src_base - coding->source;
5588 coding->charbuf_used = charbuf - coding->charbuf;
5589 }
5590
5591 static bool
5592 encode_coding_charset (struct coding_system *coding)
5593 {
5594 bool multibytep = coding->dst_multibyte;
5595 int *charbuf = coding->charbuf;
5596 int *charbuf_end = charbuf + coding->charbuf_used;
5597 unsigned char *dst = coding->destination + coding->produced;
5598 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5599 int safe_room = MAX_MULTIBYTE_LENGTH;
5600 ptrdiff_t produced_chars = 0;
5601 Lisp_Object attrs, charset_list;
5602 bool ascii_compatible;
5603 int c;
5604
5605 CODING_GET_INFO (coding, attrs, charset_list);
5606 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5607
5608 while (charbuf < charbuf_end)
5609 {
5610 struct charset *charset;
5611 unsigned code;
5612
5613 ASSURE_DESTINATION (safe_room);
5614 c = *charbuf++;
5615 if (ascii_compatible && ASCII_CHAR_P (c))
5616 EMIT_ONE_ASCII_BYTE (c);
5617 else if (CHAR_BYTE8_P (c))
5618 {
5619 c = CHAR_TO_BYTE8 (c);
5620 EMIT_ONE_BYTE (c);
5621 }
5622 else
5623 {
5624 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5625 &code, charset);
5626
5627 if (charset)
5628 {
5629 if (CHARSET_DIMENSION (charset) == 1)
5630 EMIT_ONE_BYTE (code);
5631 else if (CHARSET_DIMENSION (charset) == 2)
5632 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5633 else if (CHARSET_DIMENSION (charset) == 3)
5634 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5635 else
5636 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5637 (code >> 8) & 0xFF, code & 0xFF);
5638 }
5639 else
5640 {
5641 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5642 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5643 else
5644 c = coding->default_char;
5645 EMIT_ONE_BYTE (c);
5646 }
5647 }
5648 }
5649
5650 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5651 coding->produced_char += produced_chars;
5652 coding->produced = dst - coding->destination;
5653 return 0;
5654 }
5655
5656 \f
5657 /*** 7. C library functions ***/
5658
5659 /* Setup coding context CODING from information about CODING_SYSTEM.
5660 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5661 CODING_SYSTEM is invalid, signal an error. */
5662
5663 void
5664 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5665 {
5666 Lisp_Object attrs;
5667 Lisp_Object eol_type;
5668 Lisp_Object coding_type;
5669 Lisp_Object val;
5670
5671 if (NILP (coding_system))
5672 coding_system = Qundecided;
5673
5674 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5675
5676 attrs = CODING_ID_ATTRS (coding->id);
5677 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5678
5679 coding->mode = 0;
5680 if (VECTORP (eol_type))
5681 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5682 | CODING_REQUIRE_DETECTION_MASK);
5683 else if (! EQ (eol_type, Qunix))
5684 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5685 | CODING_REQUIRE_ENCODING_MASK);
5686 else
5687 coding->common_flags = 0;
5688 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5689 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5690 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5691 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5692 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5693 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5694
5695 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5696 coding->max_charset_id = SCHARS (val) - 1;
5697 coding->safe_charsets = SDATA (val);
5698 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5699 coding->carryover_bytes = 0;
5700
5701 coding_type = CODING_ATTR_TYPE (attrs);
5702 if (EQ (coding_type, Qundecided))
5703 {
5704 coding->detector = NULL;
5705 coding->decoder = decode_coding_raw_text;
5706 coding->encoder = encode_coding_raw_text;
5707 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5708 coding->spec.undecided.inhibit_nbd
5709 = (NILP (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection))
5710 ? -1
5711 : EQ (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection), Qt)
5712 ? 1
5713 : 0);
5714 coding->spec.undecided.inhibit_ied
5715 = (NILP (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection))
5716 ? -1
5717 : EQ (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection), Qt)
5718 ? 1
5719 : 0);
5720 coding->spec.undecided.prefer_utf_8
5721 = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5722 }
5723 else if (EQ (coding_type, Qiso_2022))
5724 {
5725 int i;
5726 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5727
5728 /* Invoke graphic register 0 to plane 0. */
5729 CODING_ISO_INVOCATION (coding, 0) = 0;
5730 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5731 CODING_ISO_INVOCATION (coding, 1)
5732 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5733 /* Setup the initial status of designation. */
5734 for (i = 0; i < 4; i++)
5735 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5736 /* Not single shifting initially. */
5737 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5738 /* Beginning of buffer should also be regarded as bol. */
5739 CODING_ISO_BOL (coding) = 1;
5740 coding->detector = detect_coding_iso_2022;
5741 coding->decoder = decode_coding_iso_2022;
5742 coding->encoder = encode_coding_iso_2022;
5743 if (flags & CODING_ISO_FLAG_SAFE)
5744 coding->mode |= CODING_MODE_SAFE_ENCODING;
5745 coding->common_flags
5746 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5747 | CODING_REQUIRE_FLUSHING_MASK);
5748 if (flags & CODING_ISO_FLAG_COMPOSITION)
5749 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5750 if (flags & CODING_ISO_FLAG_DESIGNATION)
5751 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5752 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5753 {
5754 setup_iso_safe_charsets (attrs);
5755 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5756 coding->max_charset_id = SCHARS (val) - 1;
5757 coding->safe_charsets = SDATA (val);
5758 }
5759 CODING_ISO_FLAGS (coding) = flags;
5760 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5761 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5762 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5763 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5764 }
5765 else if (EQ (coding_type, Qcharset))
5766 {
5767 coding->detector = detect_coding_charset;
5768 coding->decoder = decode_coding_charset;
5769 coding->encoder = encode_coding_charset;
5770 coding->common_flags
5771 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5772 }
5773 else if (EQ (coding_type, Qutf_8))
5774 {
5775 val = AREF (attrs, coding_attr_utf_bom);
5776 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5777 : EQ (val, Qt) ? utf_with_bom
5778 : utf_without_bom);
5779 coding->detector = detect_coding_utf_8;
5780 coding->decoder = decode_coding_utf_8;
5781 coding->encoder = encode_coding_utf_8;
5782 coding->common_flags
5783 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5784 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5785 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5786 }
5787 else if (EQ (coding_type, Qutf_16))
5788 {
5789 val = AREF (attrs, coding_attr_utf_bom);
5790 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5791 : EQ (val, Qt) ? utf_with_bom
5792 : utf_without_bom);
5793 val = AREF (attrs, coding_attr_utf_16_endian);
5794 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5795 : utf_16_little_endian);
5796 CODING_UTF_16_SURROGATE (coding) = 0;
5797 coding->detector = detect_coding_utf_16;
5798 coding->decoder = decode_coding_utf_16;
5799 coding->encoder = encode_coding_utf_16;
5800 coding->common_flags
5801 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5802 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5803 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5804 }
5805 else if (EQ (coding_type, Qccl))
5806 {
5807 coding->detector = detect_coding_ccl;
5808 coding->decoder = decode_coding_ccl;
5809 coding->encoder = encode_coding_ccl;
5810 coding->common_flags
5811 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5812 | CODING_REQUIRE_FLUSHING_MASK);
5813 }
5814 else if (EQ (coding_type, Qemacs_mule))
5815 {
5816 coding->detector = detect_coding_emacs_mule;
5817 coding->decoder = decode_coding_emacs_mule;
5818 coding->encoder = encode_coding_emacs_mule;
5819 coding->common_flags
5820 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5821 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5822 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5823 {
5824 Lisp_Object tail, safe_charsets;
5825 int max_charset_id = 0;
5826
5827 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5828 tail = XCDR (tail))
5829 if (max_charset_id < XFASTINT (XCAR (tail)))
5830 max_charset_id = XFASTINT (XCAR (tail));
5831 safe_charsets = make_uninit_string (max_charset_id + 1);
5832 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5833 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5834 tail = XCDR (tail))
5835 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5836 coding->max_charset_id = max_charset_id;
5837 coding->safe_charsets = SDATA (safe_charsets);
5838 }
5839 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5840 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5841 }
5842 else if (EQ (coding_type, Qshift_jis))
5843 {
5844 coding->detector = detect_coding_sjis;
5845 coding->decoder = decode_coding_sjis;
5846 coding->encoder = encode_coding_sjis;
5847 coding->common_flags
5848 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849 }
5850 else if (EQ (coding_type, Qbig5))
5851 {
5852 coding->detector = detect_coding_big5;
5853 coding->decoder = decode_coding_big5;
5854 coding->encoder = encode_coding_big5;
5855 coding->common_flags
5856 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5857 }
5858 else /* EQ (coding_type, Qraw_text) */
5859 {
5860 coding->detector = NULL;
5861 coding->decoder = decode_coding_raw_text;
5862 coding->encoder = encode_coding_raw_text;
5863 if (! EQ (eol_type, Qunix))
5864 {
5865 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5866 if (! VECTORP (eol_type))
5867 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5868 }
5869
5870 }
5871
5872 return;
5873 }
5874
5875 /* Return a list of charsets supported by CODING. */
5876
5877 Lisp_Object
5878 coding_charset_list (struct coding_system *coding)
5879 {
5880 Lisp_Object attrs, charset_list;
5881
5882 CODING_GET_INFO (coding, attrs, charset_list);
5883 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5884 {
5885 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5886
5887 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5888 charset_list = Viso_2022_charset_list;
5889 }
5890 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5891 {
5892 charset_list = Vemacs_mule_charset_list;
5893 }
5894 return charset_list;
5895 }
5896
5897
5898 /* Return a list of charsets supported by CODING-SYSTEM. */
5899
5900 Lisp_Object
5901 coding_system_charset_list (Lisp_Object coding_system)
5902 {
5903 ptrdiff_t id;
5904 Lisp_Object attrs, charset_list;
5905
5906 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5907 attrs = CODING_ID_ATTRS (id);
5908
5909 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5910 {
5911 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5912
5913 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5914 charset_list = Viso_2022_charset_list;
5915 else
5916 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5917 }
5918 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5919 {
5920 charset_list = Vemacs_mule_charset_list;
5921 }
5922 else
5923 {
5924 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5925 }
5926 return charset_list;
5927 }
5928
5929
5930 /* Return raw-text or one of its subsidiaries that has the same
5931 eol_type as CODING-SYSTEM. */
5932
5933 Lisp_Object
5934 raw_text_coding_system (Lisp_Object coding_system)
5935 {
5936 Lisp_Object spec, attrs;
5937 Lisp_Object eol_type, raw_text_eol_type;
5938
5939 if (NILP (coding_system))
5940 return Qraw_text;
5941 spec = CODING_SYSTEM_SPEC (coding_system);
5942 attrs = AREF (spec, 0);
5943
5944 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5945 return coding_system;
5946
5947 eol_type = AREF (spec, 2);
5948 if (VECTORP (eol_type))
5949 return Qraw_text;
5950 spec = CODING_SYSTEM_SPEC (Qraw_text);
5951 raw_text_eol_type = AREF (spec, 2);
5952 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5953 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5954 : AREF (raw_text_eol_type, 2));
5955 }
5956
5957
5958 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5959 the subsidiary that has the same eol-spec as PARENT (if it is not
5960 nil and specifies end-of-line format) or the system's setting
5961 (system_eol_type). */
5962
5963 Lisp_Object
5964 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5965 {
5966 Lisp_Object spec, eol_type;
5967
5968 if (NILP (coding_system))
5969 coding_system = Qraw_text;
5970 spec = CODING_SYSTEM_SPEC (coding_system);
5971 eol_type = AREF (spec, 2);
5972 if (VECTORP (eol_type))
5973 {
5974 Lisp_Object parent_eol_type;
5975
5976 if (! NILP (parent))
5977 {
5978 Lisp_Object parent_spec;
5979
5980 parent_spec = CODING_SYSTEM_SPEC (parent);
5981 parent_eol_type = AREF (parent_spec, 2);
5982 if (VECTORP (parent_eol_type))
5983 parent_eol_type = system_eol_type;
5984 }
5985 else
5986 parent_eol_type = system_eol_type;
5987 if (EQ (parent_eol_type, Qunix))
5988 coding_system = AREF (eol_type, 0);
5989 else if (EQ (parent_eol_type, Qdos))
5990 coding_system = AREF (eol_type, 1);
5991 else if (EQ (parent_eol_type, Qmac))
5992 coding_system = AREF (eol_type, 2);
5993 }
5994 return coding_system;
5995 }
5996
5997
5998 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5999 decided for writing to a process. If not, complement them, and
6000 return a new coding system. */
6001
6002 Lisp_Object
6003 complement_process_encoding_system (Lisp_Object coding_system)
6004 {
6005 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6006 Lisp_Object spec, attrs;
6007 int i;
6008
6009 for (i = 0; i < 3; i++)
6010 {
6011 if (i == 1)
6012 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6013 else if (i == 2)
6014 coding_system = preferred_coding_system ();
6015 spec = CODING_SYSTEM_SPEC (coding_system);
6016 if (NILP (spec))
6017 continue;
6018 attrs = AREF (spec, 0);
6019 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6020 coding_base = CODING_ATTR_BASE_NAME (attrs);
6021 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6022 eol_base = coding_system;
6023 if (! NILP (coding_base) && ! NILP (eol_base))
6024 break;
6025 }
6026
6027 if (i > 0)
6028 /* The original CODING_SYSTEM didn't specify text-conversion or
6029 eol-conversion. Be sure that we return a fully complemented
6030 coding system. */
6031 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6032 return coding_system;
6033 }
6034
6035
6036 /* Emacs has a mechanism to automatically detect a coding system if it
6037 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6038 it's impossible to distinguish some coding systems accurately
6039 because they use the same range of codes. So, at first, coding
6040 systems are categorized into 7, those are:
6041
6042 o coding-category-emacs-mule
6043
6044 The category for a coding system which has the same code range
6045 as Emacs' internal format. Assigned the coding-system (Lisp
6046 symbol) `emacs-mule' by default.
6047
6048 o coding-category-sjis
6049
6050 The category for a coding system which has the same code range
6051 as SJIS. Assigned the coding-system (Lisp
6052 symbol) `japanese-shift-jis' by default.
6053
6054 o coding-category-iso-7
6055
6056 The category for a coding system which has the same code range
6057 as ISO2022 of 7-bit environment. This doesn't use any locking
6058 shift and single shift functions. This can encode/decode all
6059 charsets. Assigned the coding-system (Lisp symbol)
6060 `iso-2022-7bit' by default.
6061
6062 o coding-category-iso-7-tight
6063
6064 Same as coding-category-iso-7 except that this can
6065 encode/decode only the specified charsets.
6066
6067 o coding-category-iso-8-1
6068
6069 The category for a coding system which has the same code range
6070 as ISO2022 of 8-bit environment and graphic plane 1 used only
6071 for DIMENSION1 charset. This doesn't use any locking shift
6072 and single shift functions. Assigned the coding-system (Lisp
6073 symbol) `iso-latin-1' by default.
6074
6075 o coding-category-iso-8-2
6076
6077 The category for a coding system which has the same code range
6078 as ISO2022 of 8-bit environment and graphic plane 1 used only
6079 for DIMENSION2 charset. This doesn't use any locking shift
6080 and single shift functions. Assigned the coding-system (Lisp
6081 symbol) `japanese-iso-8bit' by default.
6082
6083 o coding-category-iso-7-else
6084
6085 The category for a coding system which has the same code range
6086 as ISO2022 of 7-bit environment but uses locking shift or
6087 single shift functions. Assigned the coding-system (Lisp
6088 symbol) `iso-2022-7bit-lock' by default.
6089
6090 o coding-category-iso-8-else
6091
6092 The category for a coding system which has the same code range
6093 as ISO2022 of 8-bit environment but uses locking shift or
6094 single shift functions. Assigned the coding-system (Lisp
6095 symbol) `iso-2022-8bit-ss2' by default.
6096
6097 o coding-category-big5
6098
6099 The category for a coding system which has the same code range
6100 as BIG5. Assigned the coding-system (Lisp symbol)
6101 `cn-big5' by default.
6102
6103 o coding-category-utf-8
6104
6105 The category for a coding system which has the same code range
6106 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6107 symbol) `utf-8' by default.
6108
6109 o coding-category-utf-16-be
6110
6111 The category for a coding system in which a text has an
6112 Unicode signature (cf. Unicode Standard) in the order of BIG
6113 endian at the head. Assigned the coding-system (Lisp symbol)
6114 `utf-16-be' by default.
6115
6116 o coding-category-utf-16-le
6117
6118 The category for a coding system in which a text has an
6119 Unicode signature (cf. Unicode Standard) in the order of
6120 LITTLE endian at the head. Assigned the coding-system (Lisp
6121 symbol) `utf-16-le' by default.
6122
6123 o coding-category-ccl
6124
6125 The category for a coding system of which encoder/decoder is
6126 written in CCL programs. The default value is nil, i.e., no
6127 coding system is assigned.
6128
6129 o coding-category-binary
6130
6131 The category for a coding system not categorized in any of the
6132 above. Assigned the coding-system (Lisp symbol)
6133 `no-conversion' by default.
6134
6135 Each of them is a Lisp symbol and the value is an actual
6136 `coding-system's (this is also a Lisp symbol) assigned by a user.
6137 What Emacs does actually is to detect a category of coding system.
6138 Then, it uses a `coding-system' assigned to it. If Emacs can't
6139 decide only one possible category, it selects a category of the
6140 highest priority. Priorities of categories are also specified by a
6141 user in a Lisp variable `coding-category-list'.
6142
6143 */
6144
6145 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6146 int eol_seen);
6147
6148
6149 /* Return the number of ASCII characters at the head of the source.
6150 By side effects, set coding->head_ascii and update
6151 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6152 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6153 reliable only when all the source bytes are ASCII. */
6154
6155 static int
6156 check_ascii (struct coding_system *coding)
6157 {
6158 const unsigned char *src, *end;
6159 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6160 int eol_seen = coding->eol_seen;
6161
6162 coding_set_source (coding);
6163 src = coding->source;
6164 end = src + coding->src_bytes;
6165
6166 if (inhibit_eol_conversion
6167 || SYMBOLP (eol_type))
6168 {
6169 /* We don't have to check EOL format. */
6170 while (src < end && !( *src & 0x80))
6171 {
6172 if (*src++ == '\n')
6173 eol_seen |= EOL_SEEN_LF;
6174 }
6175 }
6176 else
6177 {
6178 end--; /* We look ahead one byte for "CR LF". */
6179 while (src < end)
6180 {
6181 int c = *src;
6182
6183 if (c & 0x80)
6184 break;
6185 src++;
6186 if (c == '\r')
6187 {
6188 if (*src == '\n')
6189 {
6190 eol_seen |= EOL_SEEN_CRLF;
6191 src++;
6192 }
6193 else
6194 eol_seen |= EOL_SEEN_CR;
6195 }
6196 else if (c == '\n')
6197 eol_seen |= EOL_SEEN_LF;
6198 }
6199 if (src == end)
6200 {
6201 int c = *src;
6202
6203 /* All bytes but the last one C are ASCII. */
6204 if (! (c & 0x80))
6205 {
6206 if (c == '\r')
6207 eol_seen |= EOL_SEEN_CR;
6208 else if (c == '\n')
6209 eol_seen |= EOL_SEEN_LF;
6210 src++;
6211 }
6212 }
6213 }
6214 coding->head_ascii = src - coding->source;
6215 coding->eol_seen = eol_seen;
6216 return (coding->head_ascii);
6217 }
6218
6219
6220 /* Return the number of characters at the source if all the bytes are
6221 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6222 effects, update coding->eol_seen. The value of coding->eol_seen is
6223 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6224 the value is reliable only when all the source bytes are valid
6225 UTF-8. */
6226
6227 static int
6228 check_utf_8 (struct coding_system *coding)
6229 {
6230 const unsigned char *src, *end;
6231 int eol_seen;
6232 int nchars = coding->head_ascii;
6233
6234 if (coding->head_ascii < 0)
6235 check_ascii (coding);
6236 else
6237 coding_set_source (coding);
6238 src = coding->source + coding->head_ascii;
6239 /* We look ahead one byte for CR LF. */
6240 end = coding->source + coding->src_bytes - 1;
6241 eol_seen = coding->eol_seen;
6242 while (src < end)
6243 {
6244 int c = *src;
6245
6246 if (UTF_8_1_OCTET_P (*src))
6247 {
6248 src++;
6249 if (c < 0x20)
6250 {
6251 if (c == '\r')
6252 {
6253 if (*src == '\n')
6254 {
6255 eol_seen |= EOL_SEEN_CRLF;
6256 src++;
6257 nchars++;
6258 }
6259 else
6260 eol_seen |= EOL_SEEN_CR;
6261 }
6262 else if (c == '\n')
6263 eol_seen |= EOL_SEEN_LF;
6264 }
6265 }
6266 else if (UTF_8_2_OCTET_LEADING_P (c))
6267 {
6268 if (c < 0xC2 /* overlong sequence */
6269 || src + 1 >= end
6270 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6271 return -1;
6272 src += 2;
6273 }
6274 else if (UTF_8_3_OCTET_LEADING_P (c))
6275 {
6276 if (src + 2 >= end
6277 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6278 && UTF_8_EXTRA_OCTET_P (src[2])))
6279 return -1;
6280 c = (((c & 0xF) << 12)
6281 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6282 if (c < 0x800 /* overlong sequence */
6283 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6284 return -1;
6285 src += 3;
6286 }
6287 else if (UTF_8_4_OCTET_LEADING_P (c))
6288 {
6289 if (src + 3 >= end
6290 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6291 && UTF_8_EXTRA_OCTET_P (src[2])
6292 && UTF_8_EXTRA_OCTET_P (src[3])))
6293 return -1;
6294 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6295 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6296 if (c < 0x10000 /* overlong sequence */
6297 || c >= 0x110000) /* non-Unicode character */
6298 return -1;
6299 src += 4;
6300 }
6301 else
6302 return -1;
6303 nchars++;
6304 }
6305
6306 if (src == end)
6307 {
6308 if (! UTF_8_1_OCTET_P (*src))
6309 return -1;
6310 nchars++;
6311 if (*src == '\r')
6312 eol_seen |= EOL_SEEN_CR;
6313 else if (*src == '\n')
6314 eol_seen |= EOL_SEEN_LF;
6315 }
6316 coding->eol_seen = eol_seen;
6317 return nchars;
6318 }
6319
6320
6321 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6322 SOURCE is encoded. If CATEGORY is one of
6323 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6324 two-byte, else they are encoded by one-byte.
6325
6326 Return one of EOL_SEEN_XXX. */
6327
6328 #define MAX_EOL_CHECK_COUNT 3
6329
6330 static int
6331 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6332 enum coding_category category)
6333 {
6334 const unsigned char *src = source, *src_end = src + src_bytes;
6335 unsigned char c;
6336 int total = 0;
6337 int eol_seen = EOL_SEEN_NONE;
6338
6339 if ((1 << category) & CATEGORY_MASK_UTF_16)
6340 {
6341 bool msb = category == (coding_category_utf_16_le
6342 | coding_category_utf_16_le_nosig);
6343 bool lsb = !msb;
6344
6345 while (src + 1 < src_end)
6346 {
6347 c = src[lsb];
6348 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6349 {
6350 int this_eol;
6351
6352 if (c == '\n')
6353 this_eol = EOL_SEEN_LF;
6354 else if (src + 3 >= src_end
6355 || src[msb + 2] != 0
6356 || src[lsb + 2] != '\n')
6357 this_eol = EOL_SEEN_CR;
6358 else
6359 {
6360 this_eol = EOL_SEEN_CRLF;
6361 src += 2;
6362 }
6363
6364 if (eol_seen == EOL_SEEN_NONE)
6365 /* This is the first end-of-line. */
6366 eol_seen = this_eol;
6367 else if (eol_seen != this_eol)
6368 {
6369 /* The found type is different from what found before.
6370 Allow for stray ^M characters in DOS EOL files. */
6371 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6372 || (eol_seen == EOL_SEEN_CRLF
6373 && this_eol == EOL_SEEN_CR))
6374 eol_seen = EOL_SEEN_CRLF;
6375 else
6376 {
6377 eol_seen = EOL_SEEN_LF;
6378 break;
6379 }
6380 }
6381 if (++total == MAX_EOL_CHECK_COUNT)
6382 break;
6383 }
6384 src += 2;
6385 }
6386 }
6387 else
6388 while (src < src_end)
6389 {
6390 c = *src++;
6391 if (c == '\n' || c == '\r')
6392 {
6393 int this_eol;
6394
6395 if (c == '\n')
6396 this_eol = EOL_SEEN_LF;
6397 else if (src >= src_end || *src != '\n')
6398 this_eol = EOL_SEEN_CR;
6399 else
6400 this_eol = EOL_SEEN_CRLF, src++;
6401
6402 if (eol_seen == EOL_SEEN_NONE)
6403 /* This is the first end-of-line. */
6404 eol_seen = this_eol;
6405 else if (eol_seen != this_eol)
6406 {
6407 /* The found type is different from what found before.
6408 Allow for stray ^M characters in DOS EOL files. */
6409 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6410 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6411 eol_seen = EOL_SEEN_CRLF;
6412 else
6413 {
6414 eol_seen = EOL_SEEN_LF;
6415 break;
6416 }
6417 }
6418 if (++total == MAX_EOL_CHECK_COUNT)
6419 break;
6420 }
6421 }
6422 return eol_seen;
6423 }
6424
6425
6426 static Lisp_Object
6427 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6428 {
6429 Lisp_Object eol_type;
6430
6431 eol_type = CODING_ID_EOL_TYPE (coding->id);
6432 if (! VECTORP (eol_type))
6433 /* Already adjusted. */
6434 return eol_type;
6435 if (eol_seen & EOL_SEEN_LF)
6436 {
6437 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6438 eol_type = Qunix;
6439 }
6440 else if (eol_seen & EOL_SEEN_CRLF)
6441 {
6442 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6443 eol_type = Qdos;
6444 }
6445 else if (eol_seen & EOL_SEEN_CR)
6446 {
6447 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6448 eol_type = Qmac;
6449 }
6450 return eol_type;
6451 }
6452
6453 /* Detect how a text specified in CODING is encoded. If a coding
6454 system is detected, update fields of CODING by the detected coding
6455 system. */
6456
6457 static void
6458 detect_coding (struct coding_system *coding)
6459 {
6460 const unsigned char *src, *src_end;
6461 unsigned int saved_mode = coding->mode;
6462 Lisp_Object found = Qnil;
6463 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6464
6465 coding->consumed = coding->consumed_char = 0;
6466 coding->produced = coding->produced_char = 0;
6467 coding_set_source (coding);
6468
6469 src_end = coding->source + coding->src_bytes;
6470
6471 coding->eol_seen = EOL_SEEN_NONE;
6472 /* If we have not yet decided the text encoding type, detect it
6473 now. */
6474 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6475 {
6476 int c, i;
6477 struct coding_detection_info detect_info;
6478 bool null_byte_found = 0, eight_bit_found = 0;
6479 int inhibit_nbd /* null byte detection */
6480 = (coding->spec.undecided.inhibit_nbd > 0
6481 | (coding->spec.undecided.inhibit_nbd == 0
6482 & inhibit_null_byte_detection));
6483 int inhibit_ied /* iso escape detection */
6484 = (coding->spec.undecided.inhibit_ied > 0
6485 | (coding->spec.undecided.inhibit_ied == 0
6486 & inhibit_iso_escape_detection));
6487 int prefer_utf_8
6488 = coding->spec.undecided.prefer_utf_8;
6489
6490 coding->head_ascii = 0;
6491 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6492 for (src = coding->source; src < src_end; src++)
6493 {
6494 c = *src;
6495 if (c & 0x80)
6496 {
6497 eight_bit_found = 1;
6498 if (null_byte_found)
6499 break;
6500 }
6501 else if (c < 0x20)
6502 {
6503 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6504 && ! inhibit_ied
6505 && ! detect_info.checked)
6506 {
6507 if (detect_coding_iso_2022 (coding, &detect_info))
6508 {
6509 /* We have scanned the whole data. */
6510 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6511 {
6512 /* We didn't find an 8-bit code. We may
6513 have found a null-byte, but it's very
6514 rare that a binary file conforms to
6515 ISO-2022. */
6516 src = src_end;
6517 coding->head_ascii = src - coding->source;
6518 }
6519 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6520 break;
6521 }
6522 }
6523 else if (! c && !inhibit_nbd)
6524 {
6525 null_byte_found = 1;
6526 if (eight_bit_found)
6527 break;
6528 }
6529 else if (! disable_ascii_optimization
6530 && ! inhibit_eol_conversion)
6531 {
6532 if (c == '\r')
6533 {
6534 if (src < src_end && src[1] == '\n')
6535 {
6536 coding->eol_seen |= EOL_SEEN_CRLF;
6537 src++;
6538 if (! eight_bit_found)
6539 coding->head_ascii++;
6540 }
6541 else
6542 coding->eol_seen |= EOL_SEEN_CR;
6543 }
6544 else if (c == '\n')
6545 {
6546 coding->eol_seen |= EOL_SEEN_LF;
6547 }
6548 }
6549
6550 if (! eight_bit_found)
6551 coding->head_ascii++;
6552 }
6553 else if (! eight_bit_found)
6554 coding->head_ascii++;
6555 }
6556
6557 if (null_byte_found || eight_bit_found
6558 || coding->head_ascii < coding->src_bytes
6559 || detect_info.found)
6560 {
6561 enum coding_category category;
6562 struct coding_system *this;
6563
6564 if (coding->head_ascii == coding->src_bytes)
6565 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6566 for (i = 0; i < coding_category_raw_text; i++)
6567 {
6568 category = coding_priorities[i];
6569 this = coding_categories + category;
6570 if (detect_info.found & (1 << category))
6571 break;
6572 }
6573 else
6574 {
6575 if (null_byte_found)
6576 {
6577 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6578 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6579 }
6580 else if (prefer_utf_8
6581 && detect_coding_utf_8 (coding, &detect_info))
6582 {
6583 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6584 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6585 }
6586 for (i = 0; i < coding_category_raw_text; i++)
6587 {
6588 category = coding_priorities[i];
6589 this = coding_categories + category;
6590 /* Some of this->detector (e.g. detect_coding_sjis)
6591 require this information. */
6592 coding->id = this->id;
6593 if (this->id < 0)
6594 {
6595 /* No coding system of this category is defined. */
6596 detect_info.rejected |= (1 << category);
6597 }
6598 else if (category >= coding_category_raw_text)
6599 continue;
6600 else if (detect_info.checked & (1 << category))
6601 {
6602 if (detect_info.found & (1 << category))
6603 break;
6604 }
6605 else if ((*(this->detector)) (coding, &detect_info)
6606 && detect_info.found & (1 << category))
6607 break;
6608 }
6609 }
6610
6611 if (i < coding_category_raw_text)
6612 {
6613 if (category == coding_category_utf_8_auto)
6614 {
6615 Lisp_Object coding_systems;
6616
6617 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6618 coding_attr_utf_bom);
6619 if (CONSP (coding_systems))
6620 {
6621 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6622 found = XCAR (coding_systems);
6623 else
6624 found = XCDR (coding_systems);
6625 }
6626 else
6627 found = CODING_ID_NAME (this->id);
6628 }
6629 else if (category == coding_category_utf_16_auto)
6630 {
6631 Lisp_Object coding_systems;
6632
6633 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6634 coding_attr_utf_bom);
6635 if (CONSP (coding_systems))
6636 {
6637 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6638 found = XCAR (coding_systems);
6639 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6640 found = XCDR (coding_systems);
6641 }
6642 else
6643 found = CODING_ID_NAME (this->id);
6644 }
6645 else
6646 found = CODING_ID_NAME (this->id);
6647 }
6648 else if (null_byte_found)
6649 found = Qno_conversion;
6650 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6651 == CATEGORY_MASK_ANY)
6652 found = Qraw_text;
6653 else if (detect_info.rejected)
6654 for (i = 0; i < coding_category_raw_text; i++)
6655 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6656 {
6657 this = coding_categories + coding_priorities[i];
6658 found = CODING_ID_NAME (this->id);
6659 break;
6660 }
6661 }
6662 }
6663 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6664 == coding_category_utf_8_auto)
6665 {
6666 Lisp_Object coding_systems;
6667 struct coding_detection_info detect_info;
6668
6669 coding_systems
6670 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6671 detect_info.found = detect_info.rejected = 0;
6672 if (check_ascii (coding) == coding->src_bytes)
6673 {
6674 if (CONSP (coding_systems))
6675 found = XCDR (coding_systems);
6676 }
6677 else
6678 {
6679 if (CONSP (coding_systems)
6680 && detect_coding_utf_8 (coding, &detect_info))
6681 {
6682 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6683 found = XCAR (coding_systems);
6684 else
6685 found = XCDR (coding_systems);
6686 }
6687 }
6688 }
6689 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6690 == coding_category_utf_16_auto)
6691 {
6692 Lisp_Object coding_systems;
6693 struct coding_detection_info detect_info;
6694
6695 coding_systems
6696 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6697 detect_info.found = detect_info.rejected = 0;
6698 coding->head_ascii = 0;
6699 if (CONSP (coding_systems)
6700 && detect_coding_utf_16 (coding, &detect_info))
6701 {
6702 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6703 found = XCAR (coding_systems);
6704 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6705 found = XCDR (coding_systems);
6706 }
6707 }
6708
6709 if (! NILP (found))
6710 {
6711 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6712 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6713 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6714 : EOL_SEEN_LF);
6715
6716 setup_coding_system (found, coding);
6717 if (specified_eol != EOL_SEEN_NONE)
6718 adjust_coding_eol_type (coding, specified_eol);
6719 }
6720
6721 coding->mode = saved_mode;
6722 }
6723
6724
6725 static void
6726 decode_eol (struct coding_system *coding)
6727 {
6728 Lisp_Object eol_type;
6729 unsigned char *p, *pbeg, *pend;
6730
6731 eol_type = CODING_ID_EOL_TYPE (coding->id);
6732 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6733 return;
6734
6735 if (NILP (coding->dst_object))
6736 pbeg = coding->destination;
6737 else
6738 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6739 pend = pbeg + coding->produced;
6740
6741 if (VECTORP (eol_type))
6742 {
6743 int eol_seen = EOL_SEEN_NONE;
6744
6745 for (p = pbeg; p < pend; p++)
6746 {
6747 if (*p == '\n')
6748 eol_seen |= EOL_SEEN_LF;
6749 else if (*p == '\r')
6750 {
6751 if (p + 1 < pend && *(p + 1) == '\n')
6752 {
6753 eol_seen |= EOL_SEEN_CRLF;
6754 p++;
6755 }
6756 else
6757 eol_seen |= EOL_SEEN_CR;
6758 }
6759 }
6760 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6761 if ((eol_seen & EOL_SEEN_CRLF) != 0
6762 && (eol_seen & EOL_SEEN_CR) != 0
6763 && (eol_seen & EOL_SEEN_LF) == 0)
6764 eol_seen = EOL_SEEN_CRLF;
6765 else if (eol_seen != EOL_SEEN_NONE
6766 && eol_seen != EOL_SEEN_LF
6767 && eol_seen != EOL_SEEN_CRLF
6768 && eol_seen != EOL_SEEN_CR)
6769 eol_seen = EOL_SEEN_LF;
6770 if (eol_seen != EOL_SEEN_NONE)
6771 eol_type = adjust_coding_eol_type (coding, eol_seen);
6772 }
6773
6774 if (EQ (eol_type, Qmac))
6775 {
6776 for (p = pbeg; p < pend; p++)
6777 if (*p == '\r')
6778 *p = '\n';
6779 }
6780 else if (EQ (eol_type, Qdos))
6781 {
6782 ptrdiff_t n = 0;
6783
6784 if (NILP (coding->dst_object))
6785 {
6786 /* Start deleting '\r' from the tail to minimize the memory
6787 movement. */
6788 for (p = pend - 2; p >= pbeg; p--)
6789 if (*p == '\r')
6790 {
6791 memmove (p, p + 1, pend-- - p - 1);
6792 n++;
6793 }
6794 }
6795 else
6796 {
6797 ptrdiff_t pos_byte = coding->dst_pos_byte;
6798 ptrdiff_t pos = coding->dst_pos;
6799 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6800
6801 while (pos < pos_end)
6802 {
6803 p = BYTE_POS_ADDR (pos_byte);
6804 if (*p == '\r' && p[1] == '\n')
6805 {
6806 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6807 n++;
6808 pos_end--;
6809 }
6810 pos++;
6811 if (coding->dst_multibyte)
6812 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6813 else
6814 pos_byte++;
6815 }
6816 }
6817 coding->produced -= n;
6818 coding->produced_char -= n;
6819 }
6820 }
6821
6822
6823 /* Return a translation table (or list of them) from coding system
6824 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6825 not ENCODEP). */
6826
6827 static Lisp_Object
6828 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6829 {
6830 Lisp_Object standard, translation_table;
6831 Lisp_Object val;
6832
6833 if (NILP (Venable_character_translation))
6834 {
6835 if (max_lookup)
6836 *max_lookup = 0;
6837 return Qnil;
6838 }
6839 if (encodep)
6840 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6841 standard = Vstandard_translation_table_for_encode;
6842 else
6843 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6844 standard = Vstandard_translation_table_for_decode;
6845 if (NILP (translation_table))
6846 translation_table = standard;
6847 else
6848 {
6849 if (SYMBOLP (translation_table))
6850 translation_table = Fget (translation_table, Qtranslation_table);
6851 else if (CONSP (translation_table))
6852 {
6853 translation_table = Fcopy_sequence (translation_table);
6854 for (val = translation_table; CONSP (val); val = XCDR (val))
6855 if (SYMBOLP (XCAR (val)))
6856 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6857 }
6858 if (CHAR_TABLE_P (standard))
6859 {
6860 if (CONSP (translation_table))
6861 translation_table = nconc2 (translation_table,
6862 Fcons (standard, Qnil));
6863 else
6864 translation_table = Fcons (translation_table,
6865 Fcons (standard, Qnil));
6866 }
6867 }
6868
6869 if (max_lookup)
6870 {
6871 *max_lookup = 1;
6872 if (CHAR_TABLE_P (translation_table)
6873 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6874 {
6875 val = XCHAR_TABLE (translation_table)->extras[1];
6876 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6877 *max_lookup = XFASTINT (val);
6878 }
6879 else if (CONSP (translation_table))
6880 {
6881 Lisp_Object tail;
6882
6883 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6884 if (CHAR_TABLE_P (XCAR (tail))
6885 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6886 {
6887 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6888 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6889 *max_lookup = XFASTINT (tailval);
6890 }
6891 }
6892 }
6893 return translation_table;
6894 }
6895
6896 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6897 do { \
6898 trans = Qnil; \
6899 if (CHAR_TABLE_P (table)) \
6900 { \
6901 trans = CHAR_TABLE_REF (table, c); \
6902 if (CHARACTERP (trans)) \
6903 c = XFASTINT (trans), trans = Qnil; \
6904 } \
6905 else if (CONSP (table)) \
6906 { \
6907 Lisp_Object tail; \
6908 \
6909 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6910 if (CHAR_TABLE_P (XCAR (tail))) \
6911 { \
6912 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6913 if (CHARACTERP (trans)) \
6914 c = XFASTINT (trans), trans = Qnil; \
6915 else if (! NILP (trans)) \
6916 break; \
6917 } \
6918 } \
6919 } while (0)
6920
6921
6922 /* Return a translation of character(s) at BUF according to TRANS.
6923 TRANS is TO-CHAR or ((FROM . TO) ...) where
6924 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6925 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6926 translation is found, and Qnil if not found..
6927 If BUF is too short to lookup characters in FROM, return Qt. */
6928
6929 static Lisp_Object
6930 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6931 {
6932
6933 if (INTEGERP (trans))
6934 return trans;
6935 for (; CONSP (trans); trans = XCDR (trans))
6936 {
6937 Lisp_Object val = XCAR (trans);
6938 Lisp_Object from = XCAR (val);
6939 ptrdiff_t len = ASIZE (from);
6940 ptrdiff_t i;
6941
6942 for (i = 0; i < len; i++)
6943 {
6944 if (buf + i == buf_end)
6945 return Qt;
6946 if (XINT (AREF (from, i)) != buf[i])
6947 break;
6948 }
6949 if (i == len)
6950 return val;
6951 }
6952 return Qnil;
6953 }
6954
6955
6956 static int
6957 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6958 bool last_block)
6959 {
6960 unsigned char *dst = coding->destination + coding->produced;
6961 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6962 ptrdiff_t produced;
6963 ptrdiff_t produced_chars = 0;
6964 int carryover = 0;
6965
6966 if (! coding->chars_at_source)
6967 {
6968 /* Source characters are in coding->charbuf. */
6969 int *buf = coding->charbuf;
6970 int *buf_end = buf + coding->charbuf_used;
6971
6972 if (EQ (coding->src_object, coding->dst_object))
6973 {
6974 coding_set_source (coding);
6975 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6976 }
6977
6978 while (buf < buf_end)
6979 {
6980 int c = *buf;
6981 ptrdiff_t i;
6982
6983 if (c >= 0)
6984 {
6985 ptrdiff_t from_nchars = 1, to_nchars = 1;
6986 Lisp_Object trans = Qnil;
6987
6988 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6989 if (! NILP (trans))
6990 {
6991 trans = get_translation (trans, buf, buf_end);
6992 if (INTEGERP (trans))
6993 c = XINT (trans);
6994 else if (CONSP (trans))
6995 {
6996 from_nchars = ASIZE (XCAR (trans));
6997 trans = XCDR (trans);
6998 if (INTEGERP (trans))
6999 c = XINT (trans);
7000 else
7001 {
7002 to_nchars = ASIZE (trans);
7003 c = XINT (AREF (trans, 0));
7004 }
7005 }
7006 else if (EQ (trans, Qt) && ! last_block)
7007 break;
7008 }
7009
7010 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7011 {
7012 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7013 / MAX_MULTIBYTE_LENGTH)
7014 < to_nchars)
7015 memory_full (SIZE_MAX);
7016 dst = alloc_destination (coding,
7017 buf_end - buf
7018 + MAX_MULTIBYTE_LENGTH * to_nchars,
7019 dst);
7020 if (EQ (coding->src_object, coding->dst_object))
7021 {
7022 coding_set_source (coding);
7023 dst_end = (((unsigned char *) coding->source)
7024 + coding->consumed);
7025 }
7026 else
7027 dst_end = coding->destination + coding->dst_bytes;
7028 }
7029
7030 for (i = 0; i < to_nchars; i++)
7031 {
7032 if (i > 0)
7033 c = XINT (AREF (trans, i));
7034 if (coding->dst_multibyte
7035 || ! CHAR_BYTE8_P (c))
7036 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7037 else
7038 *dst++ = CHAR_TO_BYTE8 (c);
7039 }
7040 produced_chars += to_nchars;
7041 buf += from_nchars;
7042 }
7043 else
7044 /* This is an annotation datum. (-C) is the length. */
7045 buf += -c;
7046 }
7047 carryover = buf_end - buf;
7048 }
7049 else
7050 {
7051 /* Source characters are at coding->source. */
7052 const unsigned char *src = coding->source;
7053 const unsigned char *src_end = src + coding->consumed;
7054
7055 if (EQ (coding->dst_object, coding->src_object))
7056 dst_end = (unsigned char *) src;
7057 if (coding->src_multibyte != coding->dst_multibyte)
7058 {
7059 if (coding->src_multibyte)
7060 {
7061 bool multibytep = 1;
7062 ptrdiff_t consumed_chars = 0;
7063
7064 while (1)
7065 {
7066 const unsigned char *src_base = src;
7067 int c;
7068
7069 ONE_MORE_BYTE (c);
7070 if (dst == dst_end)
7071 {
7072 if (EQ (coding->src_object, coding->dst_object))
7073 dst_end = (unsigned char *) src;
7074 if (dst == dst_end)
7075 {
7076 ptrdiff_t offset = src - coding->source;
7077
7078 dst = alloc_destination (coding, src_end - src + 1,
7079 dst);
7080 dst_end = coding->destination + coding->dst_bytes;
7081 coding_set_source (coding);
7082 src = coding->source + offset;
7083 src_end = coding->source + coding->consumed;
7084 if (EQ (coding->src_object, coding->dst_object))
7085 dst_end = (unsigned char *) src;
7086 }
7087 }
7088 *dst++ = c;
7089 produced_chars++;
7090 }
7091 no_more_source:
7092 ;
7093 }
7094 else
7095 while (src < src_end)
7096 {
7097 bool multibytep = 1;
7098 int c = *src++;
7099
7100 if (dst >= dst_end - 1)
7101 {
7102 if (EQ (coding->src_object, coding->dst_object))
7103 dst_end = (unsigned char *) src;
7104 if (dst >= dst_end - 1)
7105 {
7106 ptrdiff_t offset = src - coding->source;
7107 ptrdiff_t more_bytes;
7108
7109 if (EQ (coding->src_object, coding->dst_object))
7110 more_bytes = ((src_end - src) / 2) + 2;
7111 else
7112 more_bytes = src_end - src + 2;
7113 dst = alloc_destination (coding, more_bytes, dst);
7114 dst_end = coding->destination + coding->dst_bytes;
7115 coding_set_source (coding);
7116 src = coding->source + offset;
7117 src_end = coding->source + coding->consumed;
7118 if (EQ (coding->src_object, coding->dst_object))
7119 dst_end = (unsigned char *) src;
7120 }
7121 }
7122 EMIT_ONE_BYTE (c);
7123 }
7124 }
7125 else
7126 {
7127 if (!EQ (coding->src_object, coding->dst_object))
7128 {
7129 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7130
7131 if (require > 0)
7132 {
7133 ptrdiff_t offset = src - coding->source;
7134
7135 dst = alloc_destination (coding, require, dst);
7136 coding_set_source (coding);
7137 src = coding->source + offset;
7138 src_end = coding->source + coding->consumed;
7139 }
7140 }
7141 produced_chars = coding->consumed_char;
7142 while (src < src_end)
7143 *dst++ = *src++;
7144 }
7145 }
7146
7147 produced = dst - (coding->destination + coding->produced);
7148 if (BUFFERP (coding->dst_object) && produced_chars > 0)
7149 insert_from_gap (produced_chars, produced, 0);
7150 coding->produced += produced;
7151 coding->produced_char += produced_chars;
7152 return carryover;
7153 }
7154
7155 /* Compose text in CODING->object according to the annotation data at
7156 CHARBUF. CHARBUF is an array:
7157 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7158 */
7159
7160 static void
7161 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7162 {
7163 int len;
7164 ptrdiff_t to;
7165 enum composition_method method;
7166 Lisp_Object components;
7167
7168 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7169 to = pos + charbuf[2];
7170 method = (enum composition_method) (charbuf[4]);
7171
7172 if (method == COMPOSITION_RELATIVE)
7173 components = Qnil;
7174 else
7175 {
7176 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7177 int i, j;
7178
7179 if (method == COMPOSITION_WITH_RULE)
7180 len = charbuf[2] * 3 - 2;
7181 charbuf += MAX_ANNOTATION_LENGTH;
7182 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7183 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7184 {
7185 if (charbuf[i] >= 0)
7186 args[j] = make_number (charbuf[i]);
7187 else
7188 {
7189 i++;
7190 args[j] = make_number (charbuf[i] % 0x100);
7191 }
7192 }
7193 components = (i == j ? Fstring (j, args) : Fvector (j, args));
7194 }
7195 compose_text (pos, to, components, Qnil, coding->dst_object);
7196 }
7197
7198
7199 /* Put `charset' property on text in CODING->object according to
7200 the annotation data at CHARBUF. CHARBUF is an array:
7201 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7202 */
7203
7204 static void
7205 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7206 {
7207 ptrdiff_t from = pos - charbuf[2];
7208 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7209
7210 Fput_text_property (make_number (from), make_number (pos),
7211 Qcharset, CHARSET_NAME (charset),
7212 coding->dst_object);
7213 }
7214
7215
7216 #define CHARBUF_SIZE 0x4000
7217
7218 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7219 do { \
7220 coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int)); \
7221 coding->charbuf_size = CHARBUF_SIZE; \
7222 } while (0)
7223
7224
7225 static void
7226 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7227 {
7228 int *charbuf = coding->charbuf;
7229 int *charbuf_end = charbuf + coding->charbuf_used;
7230
7231 if (NILP (coding->dst_object))
7232 return;
7233
7234 while (charbuf < charbuf_end)
7235 {
7236 if (*charbuf >= 0)
7237 pos++, charbuf++;
7238 else
7239 {
7240 int len = -*charbuf;
7241
7242 if (len > 2)
7243 switch (charbuf[1])
7244 {
7245 case CODING_ANNOTATE_COMPOSITION_MASK:
7246 produce_composition (coding, charbuf, pos);
7247 break;
7248 case CODING_ANNOTATE_CHARSET_MASK:
7249 produce_charset (coding, charbuf, pos);
7250 break;
7251 }
7252 charbuf += len;
7253 }
7254 }
7255 }
7256
7257 /* Decode the data at CODING->src_object into CODING->dst_object.
7258 CODING->src_object is a buffer, a string, or nil.
7259 CODING->dst_object is a buffer.
7260
7261 If CODING->src_object is a buffer, it must be the current buffer.
7262 In this case, if CODING->src_pos is positive, it is a position of
7263 the source text in the buffer, otherwise, the source text is in the
7264 gap area of the buffer, and CODING->src_pos specifies the offset of
7265 the text from GPT (which must be the same as PT). If this is the
7266 same buffer as CODING->dst_object, CODING->src_pos must be
7267 negative.
7268
7269 If CODING->src_object is a string, CODING->src_pos is an index to
7270 that string.
7271
7272 If CODING->src_object is nil, CODING->source must already point to
7273 the non-relocatable memory area. In this case, CODING->src_pos is
7274 an offset from CODING->source.
7275
7276 The decoded data is inserted at the current point of the buffer
7277 CODING->dst_object.
7278 */
7279
7280 static void
7281 decode_coding (struct coding_system *coding)
7282 {
7283 Lisp_Object attrs;
7284 Lisp_Object undo_list;
7285 Lisp_Object translation_table;
7286 struct ccl_spec cclspec;
7287 int carryover;
7288 int i;
7289
7290 USE_SAFE_ALLOCA;
7291
7292 if (BUFFERP (coding->src_object)
7293 && coding->src_pos > 0
7294 && coding->src_pos < GPT
7295 && coding->src_pos + coding->src_chars > GPT)
7296 move_gap_both (coding->src_pos, coding->src_pos_byte);
7297
7298 undo_list = Qt;
7299 if (BUFFERP (coding->dst_object))
7300 {
7301 set_buffer_internal (XBUFFER (coding->dst_object));
7302 if (GPT != PT)
7303 move_gap_both (PT, PT_BYTE);
7304
7305 /* We must disable undo_list in order to record the whole insert
7306 transaction via record_insert at the end. But doing so also
7307 disables the recording of the first change to the undo_list.
7308 Therefore we check for first change here and record it via
7309 record_first_change if needed. */
7310 if (MODIFF <= SAVE_MODIFF)
7311 record_first_change ();
7312
7313 undo_list = BVAR (current_buffer, undo_list);
7314 bset_undo_list (current_buffer, Qt);
7315 }
7316
7317 coding->consumed = coding->consumed_char = 0;
7318 coding->produced = coding->produced_char = 0;
7319 coding->chars_at_source = 0;
7320 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7321 coding->errors = 0;
7322
7323 ALLOC_CONVERSION_WORK_AREA (coding);
7324
7325 attrs = CODING_ID_ATTRS (coding->id);
7326 translation_table = get_translation_table (attrs, 0, NULL);
7327
7328 carryover = 0;
7329 if (coding->decoder == decode_coding_ccl)
7330 {
7331 coding->spec.ccl = &cclspec;
7332 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7333 }
7334 do
7335 {
7336 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7337
7338 coding_set_source (coding);
7339 coding->annotated = 0;
7340 coding->charbuf_used = carryover;
7341 (*(coding->decoder)) (coding);
7342 coding_set_destination (coding);
7343 carryover = produce_chars (coding, translation_table, 0);
7344 if (coding->annotated)
7345 produce_annotation (coding, pos);
7346 for (i = 0; i < carryover; i++)
7347 coding->charbuf[i]
7348 = coding->charbuf[coding->charbuf_used - carryover + i];
7349 }
7350 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7351 || (coding->consumed < coding->src_bytes
7352 && (coding->result == CODING_RESULT_SUCCESS
7353 || coding->result == CODING_RESULT_INVALID_SRC)));
7354
7355 if (carryover > 0)
7356 {
7357 coding_set_destination (coding);
7358 coding->charbuf_used = carryover;
7359 produce_chars (coding, translation_table, 1);
7360 }
7361
7362 coding->carryover_bytes = 0;
7363 if (coding->consumed < coding->src_bytes)
7364 {
7365 int nbytes = coding->src_bytes - coding->consumed;
7366 const unsigned char *src;
7367
7368 coding_set_source (coding);
7369 coding_set_destination (coding);
7370 src = coding->source + coding->consumed;
7371
7372 if (coding->mode & CODING_MODE_LAST_BLOCK)
7373 {
7374 /* Flush out unprocessed data as binary chars. We are sure
7375 that the number of data is less than the size of
7376 coding->charbuf. */
7377 coding->charbuf_used = 0;
7378 coding->chars_at_source = 0;
7379
7380 while (nbytes-- > 0)
7381 {
7382 int c = *src++;
7383
7384 if (c & 0x80)
7385 c = BYTE8_TO_CHAR (c);
7386 coding->charbuf[coding->charbuf_used++] = c;
7387 }
7388 produce_chars (coding, Qnil, 1);
7389 }
7390 else
7391 {
7392 /* Record unprocessed bytes in coding->carryover. We are
7393 sure that the number of data is less than the size of
7394 coding->carryover. */
7395 unsigned char *p = coding->carryover;
7396
7397 if (nbytes > sizeof coding->carryover)
7398 nbytes = sizeof coding->carryover;
7399 coding->carryover_bytes = nbytes;
7400 while (nbytes-- > 0)
7401 *p++ = *src++;
7402 }
7403 coding->consumed = coding->src_bytes;
7404 }
7405
7406 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7407 && !inhibit_eol_conversion)
7408 decode_eol (coding);
7409 if (BUFFERP (coding->dst_object))
7410 {
7411 bset_undo_list (current_buffer, undo_list);
7412 record_insert (coding->dst_pos, coding->produced_char);
7413 }
7414
7415 SAFE_FREE ();
7416 }
7417
7418
7419 /* Extract an annotation datum from a composition starting at POS and
7420 ending before LIMIT of CODING->src_object (buffer or string), store
7421 the data in BUF, set *STOP to a starting position of the next
7422 composition (if any) or to LIMIT, and return the address of the
7423 next element of BUF.
7424
7425 If such an annotation is not found, set *STOP to a starting
7426 position of a composition after POS (if any) or to LIMIT, and
7427 return BUF. */
7428
7429 static int *
7430 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7431 struct coding_system *coding, int *buf,
7432 ptrdiff_t *stop)
7433 {
7434 ptrdiff_t start, end;
7435 Lisp_Object prop;
7436
7437 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7438 || end > limit)
7439 *stop = limit;
7440 else if (start > pos)
7441 *stop = start;
7442 else
7443 {
7444 if (start == pos)
7445 {
7446 /* We found a composition. Store the corresponding
7447 annotation data in BUF. */
7448 int *head = buf;
7449 enum composition_method method = COMPOSITION_METHOD (prop);
7450 int nchars = COMPOSITION_LENGTH (prop);
7451
7452 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7453 if (method != COMPOSITION_RELATIVE)
7454 {
7455 Lisp_Object components;
7456 ptrdiff_t i, len, i_byte;
7457
7458 components = COMPOSITION_COMPONENTS (prop);
7459 if (VECTORP (components))
7460 {
7461 len = ASIZE (components);
7462 for (i = 0; i < len; i++)
7463 *buf++ = XINT (AREF (components, i));
7464 }
7465 else if (STRINGP (components))
7466 {
7467 len = SCHARS (components);
7468 i = i_byte = 0;
7469 while (i < len)
7470 {
7471 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7472 buf++;
7473 }
7474 }
7475 else if (INTEGERP (components))
7476 {
7477 len = 1;
7478 *buf++ = XINT (components);
7479 }
7480 else if (CONSP (components))
7481 {
7482 for (len = 0; CONSP (components);
7483 len++, components = XCDR (components))
7484 *buf++ = XINT (XCAR (components));
7485 }
7486 else
7487 emacs_abort ();
7488 *head -= len;
7489 }
7490 }
7491
7492 if (find_composition (end, limit, &start, &end, &prop,
7493 coding->src_object)
7494 && end <= limit)
7495 *stop = start;
7496 else
7497 *stop = limit;
7498 }
7499 return buf;
7500 }
7501
7502
7503 /* Extract an annotation datum from a text property `charset' at POS of
7504 CODING->src_object (buffer of string), store the data in BUF, set
7505 *STOP to the position where the value of `charset' property changes
7506 (limiting by LIMIT), and return the address of the next element of
7507 BUF.
7508
7509 If the property value is nil, set *STOP to the position where the
7510 property value is non-nil (limiting by LIMIT), and return BUF. */
7511
7512 static int *
7513 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7514 struct coding_system *coding, int *buf,
7515 ptrdiff_t *stop)
7516 {
7517 Lisp_Object val, next;
7518 int id;
7519
7520 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7521 if (! NILP (val) && CHARSETP (val))
7522 id = XINT (CHARSET_SYMBOL_ID (val));
7523 else
7524 id = -1;
7525 ADD_CHARSET_DATA (buf, 0, id);
7526 next = Fnext_single_property_change (make_number (pos), Qcharset,
7527 coding->src_object,
7528 make_number (limit));
7529 *stop = XINT (next);
7530 return buf;
7531 }
7532
7533
7534 static void
7535 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7536 int max_lookup)
7537 {
7538 int *buf = coding->charbuf;
7539 int *buf_end = coding->charbuf + coding->charbuf_size;
7540 const unsigned char *src = coding->source + coding->consumed;
7541 const unsigned char *src_end = coding->source + coding->src_bytes;
7542 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7543 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7544 bool multibytep = coding->src_multibyte;
7545 Lisp_Object eol_type;
7546 int c;
7547 ptrdiff_t stop, stop_composition, stop_charset;
7548 int *lookup_buf = NULL;
7549
7550 if (! NILP (translation_table))
7551 lookup_buf = alloca (sizeof (int) * max_lookup);
7552
7553 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7554 if (VECTORP (eol_type))
7555 eol_type = Qunix;
7556
7557 /* Note: composition handling is not yet implemented. */
7558 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7559
7560 if (NILP (coding->src_object))
7561 stop = stop_composition = stop_charset = end_pos;
7562 else
7563 {
7564 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7565 stop = stop_composition = pos;
7566 else
7567 stop = stop_composition = end_pos;
7568 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7569 stop = stop_charset = pos;
7570 else
7571 stop_charset = end_pos;
7572 }
7573
7574 /* Compensate for CRLF and conversion. */
7575 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7576 while (buf < buf_end)
7577 {
7578 Lisp_Object trans;
7579
7580 if (pos == stop)
7581 {
7582 if (pos == end_pos)
7583 break;
7584 if (pos == stop_composition)
7585 buf = handle_composition_annotation (pos, end_pos, coding,
7586 buf, &stop_composition);
7587 if (pos == stop_charset)
7588 buf = handle_charset_annotation (pos, end_pos, coding,
7589 buf, &stop_charset);
7590 stop = (stop_composition < stop_charset
7591 ? stop_composition : stop_charset);
7592 }
7593
7594 if (! multibytep)
7595 {
7596 int bytes;
7597
7598 if (coding->encoder == encode_coding_raw_text
7599 || coding->encoder == encode_coding_ccl)
7600 c = *src++, pos++;
7601 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7602 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7603 else
7604 c = BYTE8_TO_CHAR (*src), src++, pos++;
7605 }
7606 else
7607 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7608 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7609 c = '\n';
7610 if (! EQ (eol_type, Qunix))
7611 {
7612 if (c == '\n')
7613 {
7614 if (EQ (eol_type, Qdos))
7615 *buf++ = '\r';
7616 else
7617 c = '\r';
7618 }
7619 }
7620
7621 trans = Qnil;
7622 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7623 if (NILP (trans))
7624 *buf++ = c;
7625 else
7626 {
7627 ptrdiff_t from_nchars = 1, to_nchars = 1;
7628 int *lookup_buf_end;
7629 const unsigned char *p = src;
7630 int i;
7631
7632 lookup_buf[0] = c;
7633 for (i = 1; i < max_lookup && p < src_end; i++)
7634 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7635 lookup_buf_end = lookup_buf + i;
7636 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7637 if (INTEGERP (trans))
7638 c = XINT (trans);
7639 else if (CONSP (trans))
7640 {
7641 from_nchars = ASIZE (XCAR (trans));
7642 trans = XCDR (trans);
7643 if (INTEGERP (trans))
7644 c = XINT (trans);
7645 else
7646 {
7647 to_nchars = ASIZE (trans);
7648 if (buf_end - buf < to_nchars)
7649 break;
7650 c = XINT (AREF (trans, 0));
7651 }
7652 }
7653 else
7654 break;
7655 *buf++ = c;
7656 for (i = 1; i < to_nchars; i++)
7657 *buf++ = XINT (AREF (trans, i));
7658 for (i = 1; i < from_nchars; i++, pos++)
7659 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7660 }
7661 }
7662
7663 coding->consumed = src - coding->source;
7664 coding->consumed_char = pos - coding->src_pos;
7665 coding->charbuf_used = buf - coding->charbuf;
7666 coding->chars_at_source = 0;
7667 }
7668
7669
7670 /* Encode the text at CODING->src_object into CODING->dst_object.
7671 CODING->src_object is a buffer or a string.
7672 CODING->dst_object is a buffer or nil.
7673
7674 If CODING->src_object is a buffer, it must be the current buffer.
7675 In this case, if CODING->src_pos is positive, it is a position of
7676 the source text in the buffer, otherwise. the source text is in the
7677 gap area of the buffer, and coding->src_pos specifies the offset of
7678 the text from GPT (which must be the same as PT). If this is the
7679 same buffer as CODING->dst_object, CODING->src_pos must be
7680 negative and CODING should not have `pre-write-conversion'.
7681
7682 If CODING->src_object is a string, CODING should not have
7683 `pre-write-conversion'.
7684
7685 If CODING->dst_object is a buffer, the encoded data is inserted at
7686 the current point of that buffer.
7687
7688 If CODING->dst_object is nil, the encoded data is placed at the
7689 memory area specified by CODING->destination. */
7690
7691 static void
7692 encode_coding (struct coding_system *coding)
7693 {
7694 Lisp_Object attrs;
7695 Lisp_Object translation_table;
7696 int max_lookup;
7697 struct ccl_spec cclspec;
7698
7699 USE_SAFE_ALLOCA;
7700
7701 attrs = CODING_ID_ATTRS (coding->id);
7702 if (coding->encoder == encode_coding_raw_text)
7703 translation_table = Qnil, max_lookup = 0;
7704 else
7705 translation_table = get_translation_table (attrs, 1, &max_lookup);
7706
7707 if (BUFFERP (coding->dst_object))
7708 {
7709 set_buffer_internal (XBUFFER (coding->dst_object));
7710 coding->dst_multibyte
7711 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7712 }
7713
7714 coding->consumed = coding->consumed_char = 0;
7715 coding->produced = coding->produced_char = 0;
7716 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7717 coding->errors = 0;
7718
7719 ALLOC_CONVERSION_WORK_AREA (coding);
7720
7721 if (coding->encoder == encode_coding_ccl)
7722 {
7723 coding->spec.ccl = &cclspec;
7724 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7725 }
7726 do {
7727 coding_set_source (coding);
7728 consume_chars (coding, translation_table, max_lookup);
7729 coding_set_destination (coding);
7730 (*(coding->encoder)) (coding);
7731 } while (coding->consumed_char < coding->src_chars);
7732
7733 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7734 insert_from_gap (coding->produced_char, coding->produced, 0);
7735
7736 SAFE_FREE ();
7737 }
7738
7739
7740 /* Name (or base name) of work buffer for code conversion. */
7741 static Lisp_Object Vcode_conversion_workbuf_name;
7742
7743 /* A working buffer used by the top level conversion. Once it is
7744 created, it is never destroyed. It has the name
7745 Vcode_conversion_workbuf_name. The other working buffers are
7746 destroyed after the use is finished, and their names are modified
7747 versions of Vcode_conversion_workbuf_name. */
7748 static Lisp_Object Vcode_conversion_reused_workbuf;
7749
7750 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7751 static bool reused_workbuf_in_use;
7752
7753
7754 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7755 multibyteness of returning buffer. */
7756
7757 static Lisp_Object
7758 make_conversion_work_buffer (bool multibyte)
7759 {
7760 Lisp_Object name, workbuf;
7761 struct buffer *current;
7762
7763 if (reused_workbuf_in_use)
7764 {
7765 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7766 workbuf = Fget_buffer_create (name);
7767 }
7768 else
7769 {
7770 reused_workbuf_in_use = 1;
7771 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7772 Vcode_conversion_reused_workbuf
7773 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7774 workbuf = Vcode_conversion_reused_workbuf;
7775 }
7776 current = current_buffer;
7777 set_buffer_internal (XBUFFER (workbuf));
7778 /* We can't allow modification hooks to run in the work buffer. For
7779 instance, directory_files_internal assumes that file decoding
7780 doesn't compile new regexps. */
7781 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7782 Ferase_buffer ();
7783 bset_undo_list (current_buffer, Qt);
7784 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7785 set_buffer_internal (current);
7786 return workbuf;
7787 }
7788
7789
7790 static Lisp_Object
7791 code_conversion_restore (Lisp_Object arg)
7792 {
7793 Lisp_Object current, workbuf;
7794 struct gcpro gcpro1;
7795
7796 GCPRO1 (arg);
7797 current = XCAR (arg);
7798 workbuf = XCDR (arg);
7799 if (! NILP (workbuf))
7800 {
7801 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7802 reused_workbuf_in_use = 0;
7803 else
7804 Fkill_buffer (workbuf);
7805 }
7806 set_buffer_internal (XBUFFER (current));
7807 UNGCPRO;
7808 return Qnil;
7809 }
7810
7811 Lisp_Object
7812 code_conversion_save (bool with_work_buf, bool multibyte)
7813 {
7814 Lisp_Object workbuf = Qnil;
7815
7816 if (with_work_buf)
7817 workbuf = make_conversion_work_buffer (multibyte);
7818 record_unwind_protect (code_conversion_restore,
7819 Fcons (Fcurrent_buffer (), workbuf));
7820 return workbuf;
7821 }
7822
7823 void
7824 decode_coding_gap (struct coding_system *coding,
7825 ptrdiff_t chars, ptrdiff_t bytes)
7826 {
7827 ptrdiff_t count = SPECPDL_INDEX ();
7828 Lisp_Object attrs;
7829
7830 coding->src_object = Fcurrent_buffer ();
7831 coding->src_chars = chars;
7832 coding->src_bytes = bytes;
7833 coding->src_pos = -chars;
7834 coding->src_pos_byte = -bytes;
7835 coding->src_multibyte = chars < bytes;
7836 coding->dst_object = coding->src_object;
7837 coding->dst_pos = PT;
7838 coding->dst_pos_byte = PT_BYTE;
7839 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7840
7841 coding->head_ascii = -1;
7842 coding->detected_utf8_chars = -1;
7843 coding->eol_seen = EOL_SEEN_NONE;
7844 if (CODING_REQUIRE_DETECTION (coding))
7845 detect_coding (coding);
7846 attrs = CODING_ID_ATTRS (coding->id);
7847 if (! disable_ascii_optimization
7848 && ! coding->src_multibyte
7849 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7850 && NILP (CODING_ATTR_POST_READ (attrs))
7851 && NILP (get_translation_table (attrs, 0, NULL)))
7852 {
7853 chars = coding->head_ascii;
7854 if (chars < 0)
7855 chars = check_ascii (coding);
7856 if (chars != bytes)
7857 {
7858 /* There exists a non-ASCII byte. */
7859 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7860 {
7861 if (coding->detected_utf8_chars >= 0)
7862 chars = coding->detected_utf8_chars;
7863 else
7864 chars = check_utf_8 (coding);
7865 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7866 && coding->head_ascii == 0
7867 && coding->source[0] == UTF_8_BOM_1
7868 && coding->source[1] == UTF_8_BOM_2
7869 && coding->source[2] == UTF_8_BOM_3)
7870 {
7871 chars--;
7872 bytes -= 3;
7873 coding->src_bytes -= 3;
7874 }
7875 }
7876 else
7877 chars = -1;
7878 }
7879 if (chars >= 0)
7880 {
7881 Lisp_Object eol_type;
7882
7883 eol_type = CODING_ID_EOL_TYPE (coding->id);
7884 if (VECTORP (eol_type))
7885 {
7886 if (coding->eol_seen != EOL_SEEN_NONE)
7887 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7888 }
7889 if (EQ (eol_type, Qmac))
7890 {
7891 unsigned char *src_end = GAP_END_ADDR;
7892 unsigned char *src = src_end - coding->src_bytes;
7893
7894 while (src < src_end)
7895 {
7896 if (*src++ == '\r')
7897 src[-1] = '\n';
7898 }
7899 }
7900 else if (EQ (eol_type, Qdos))
7901 {
7902 unsigned char *src = GAP_END_ADDR;
7903 unsigned char *src_beg = src - coding->src_bytes;
7904 unsigned char *dst = src;
7905 ptrdiff_t diff;
7906
7907 while (src_beg < src)
7908 {
7909 *--dst = *--src;
7910 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7911 src--;
7912 }
7913 diff = dst - src;
7914 bytes -= diff;
7915 chars -= diff;
7916 }
7917 coding->produced = bytes;
7918 coding->produced_char = chars;
7919 insert_from_gap (chars, bytes, 1);
7920 return;
7921 }
7922 }
7923 code_conversion_save (0, 0);
7924
7925 coding->mode |= CODING_MODE_LAST_BLOCK;
7926 current_buffer->text->inhibit_shrinking = 1;
7927 decode_coding (coding);
7928 current_buffer->text->inhibit_shrinking = 0;
7929
7930 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7931 {
7932 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7933 Lisp_Object val;
7934
7935 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7936 val = call1 (CODING_ATTR_POST_READ (attrs),
7937 make_number (coding->produced_char));
7938 CHECK_NATNUM (val);
7939 coding->produced_char += Z - prev_Z;
7940 coding->produced += Z_BYTE - prev_Z_BYTE;
7941 }
7942
7943 unbind_to (count, Qnil);
7944 }
7945
7946
7947 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7948 SRC_OBJECT into DST_OBJECT by coding context CODING.
7949
7950 SRC_OBJECT is a buffer, a string, or Qnil.
7951
7952 If it is a buffer, the text is at point of the buffer. FROM and TO
7953 are positions in the buffer.
7954
7955 If it is a string, the text is at the beginning of the string.
7956 FROM and TO are indices to the string.
7957
7958 If it is nil, the text is at coding->source. FROM and TO are
7959 indices to coding->source.
7960
7961 DST_OBJECT is a buffer, Qt, or Qnil.
7962
7963 If it is a buffer, the decoded text is inserted at point of the
7964 buffer. If the buffer is the same as SRC_OBJECT, the source text
7965 is deleted.
7966
7967 If it is Qt, a string is made from the decoded text, and
7968 set in CODING->dst_object.
7969
7970 If it is Qnil, the decoded text is stored at CODING->destination.
7971 The caller must allocate CODING->dst_bytes bytes at
7972 CODING->destination by xmalloc. If the decoded text is longer than
7973 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7974 */
7975
7976 void
7977 decode_coding_object (struct coding_system *coding,
7978 Lisp_Object src_object,
7979 ptrdiff_t from, ptrdiff_t from_byte,
7980 ptrdiff_t to, ptrdiff_t to_byte,
7981 Lisp_Object dst_object)
7982 {
7983 ptrdiff_t count = SPECPDL_INDEX ();
7984 unsigned char *destination IF_LINT (= NULL);
7985 ptrdiff_t dst_bytes IF_LINT (= 0);
7986 ptrdiff_t chars = to - from;
7987 ptrdiff_t bytes = to_byte - from_byte;
7988 Lisp_Object attrs;
7989 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7990 bool need_marker_adjustment = 0;
7991 Lisp_Object old_deactivate_mark;
7992
7993 old_deactivate_mark = Vdeactivate_mark;
7994
7995 if (NILP (dst_object))
7996 {
7997 destination = coding->destination;
7998 dst_bytes = coding->dst_bytes;
7999 }
8000
8001 coding->src_object = src_object;
8002 coding->src_chars = chars;
8003 coding->src_bytes = bytes;
8004 coding->src_multibyte = chars < bytes;
8005
8006 if (STRINGP (src_object))
8007 {
8008 coding->src_pos = from;
8009 coding->src_pos_byte = from_byte;
8010 }
8011 else if (BUFFERP (src_object))
8012 {
8013 set_buffer_internal (XBUFFER (src_object));
8014 if (from != GPT)
8015 move_gap_both (from, from_byte);
8016 if (EQ (src_object, dst_object))
8017 {
8018 struct Lisp_Marker *tail;
8019
8020 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8021 {
8022 tail->need_adjustment
8023 = tail->charpos == (tail->insertion_type ? from : to);
8024 need_marker_adjustment |= tail->need_adjustment;
8025 }
8026 saved_pt = PT, saved_pt_byte = PT_BYTE;
8027 TEMP_SET_PT_BOTH (from, from_byte);
8028 current_buffer->text->inhibit_shrinking = 1;
8029 del_range_both (from, from_byte, to, to_byte, 1);
8030 coding->src_pos = -chars;
8031 coding->src_pos_byte = -bytes;
8032 }
8033 else
8034 {
8035 coding->src_pos = from;
8036 coding->src_pos_byte = from_byte;
8037 }
8038 }
8039
8040 if (CODING_REQUIRE_DETECTION (coding))
8041 detect_coding (coding);
8042 attrs = CODING_ID_ATTRS (coding->id);
8043
8044 if (EQ (dst_object, Qt)
8045 || (! NILP (CODING_ATTR_POST_READ (attrs))
8046 && NILP (dst_object)))
8047 {
8048 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8049 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8050 coding->dst_pos = BEG;
8051 coding->dst_pos_byte = BEG_BYTE;
8052 }
8053 else if (BUFFERP (dst_object))
8054 {
8055 code_conversion_save (0, 0);
8056 coding->dst_object = dst_object;
8057 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8058 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8059 coding->dst_multibyte
8060 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8061 }
8062 else
8063 {
8064 code_conversion_save (0, 0);
8065 coding->dst_object = Qnil;
8066 /* Most callers presume this will return a multibyte result, and they
8067 won't use `binary' or `raw-text' anyway, so let's not worry about
8068 CODING_FOR_UNIBYTE. */
8069 coding->dst_multibyte = 1;
8070 }
8071
8072 decode_coding (coding);
8073
8074 if (BUFFERP (coding->dst_object))
8075 set_buffer_internal (XBUFFER (coding->dst_object));
8076
8077 if (! NILP (CODING_ATTR_POST_READ (attrs)))
8078 {
8079 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8080 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8081 Lisp_Object val;
8082
8083 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8084 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8085 old_deactivate_mark);
8086 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8087 make_number (coding->produced_char));
8088 UNGCPRO;
8089 CHECK_NATNUM (val);
8090 coding->produced_char += Z - prev_Z;
8091 coding->produced += Z_BYTE - prev_Z_BYTE;
8092 }
8093
8094 if (EQ (dst_object, Qt))
8095 {
8096 coding->dst_object = Fbuffer_string ();
8097 }
8098 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8099 {
8100 set_buffer_internal (XBUFFER (coding->dst_object));
8101 if (dst_bytes < coding->produced)
8102 {
8103 eassert (coding->produced > 0);
8104 destination = xrealloc (destination, coding->produced);
8105 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8106 move_gap_both (BEGV, BEGV_BYTE);
8107 memcpy (destination, BEGV_ADDR, coding->produced);
8108 coding->destination = destination;
8109 }
8110 }
8111
8112 if (saved_pt >= 0)
8113 {
8114 /* This is the case of:
8115 (BUFFERP (src_object) && EQ (src_object, dst_object))
8116 As we have moved PT while replacing the original buffer
8117 contents, we must recover it now. */
8118 set_buffer_internal (XBUFFER (src_object));
8119 current_buffer->text->inhibit_shrinking = 0;
8120 if (saved_pt < from)
8121 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8122 else if (saved_pt < from + chars)
8123 TEMP_SET_PT_BOTH (from, from_byte);
8124 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8125 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8126 saved_pt_byte + (coding->produced - bytes));
8127 else
8128 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8129 saved_pt_byte + (coding->produced - bytes));
8130
8131 if (need_marker_adjustment)
8132 {
8133 struct Lisp_Marker *tail;
8134
8135 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8136 if (tail->need_adjustment)
8137 {
8138 tail->need_adjustment = 0;
8139 if (tail->insertion_type)
8140 {
8141 tail->bytepos = from_byte;
8142 tail->charpos = from;
8143 }
8144 else
8145 {
8146 tail->bytepos = from_byte + coding->produced;
8147 tail->charpos
8148 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8149 ? tail->bytepos : from + coding->produced_char);
8150 }
8151 }
8152 }
8153 }
8154
8155 Vdeactivate_mark = old_deactivate_mark;
8156 unbind_to (count, coding->dst_object);
8157 }
8158
8159
8160 void
8161 encode_coding_object (struct coding_system *coding,
8162 Lisp_Object src_object,
8163 ptrdiff_t from, ptrdiff_t from_byte,
8164 ptrdiff_t to, ptrdiff_t to_byte,
8165 Lisp_Object dst_object)
8166 {
8167 ptrdiff_t count = SPECPDL_INDEX ();
8168 ptrdiff_t chars = to - from;
8169 ptrdiff_t bytes = to_byte - from_byte;
8170 Lisp_Object attrs;
8171 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8172 bool need_marker_adjustment = 0;
8173 bool kill_src_buffer = 0;
8174 Lisp_Object old_deactivate_mark;
8175
8176 old_deactivate_mark = Vdeactivate_mark;
8177
8178 coding->src_object = src_object;
8179 coding->src_chars = chars;
8180 coding->src_bytes = bytes;
8181 coding->src_multibyte = chars < bytes;
8182
8183 attrs = CODING_ID_ATTRS (coding->id);
8184
8185 if (EQ (src_object, dst_object))
8186 {
8187 struct Lisp_Marker *tail;
8188
8189 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8190 {
8191 tail->need_adjustment
8192 = tail->charpos == (tail->insertion_type ? from : to);
8193 need_marker_adjustment |= tail->need_adjustment;
8194 }
8195 }
8196
8197 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8198 {
8199 coding->src_object = code_conversion_save (1, coding->src_multibyte);
8200 set_buffer_internal (XBUFFER (coding->src_object));
8201 if (STRINGP (src_object))
8202 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8203 else if (BUFFERP (src_object))
8204 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8205 else
8206 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8207
8208 if (EQ (src_object, dst_object))
8209 {
8210 set_buffer_internal (XBUFFER (src_object));
8211 saved_pt = PT, saved_pt_byte = PT_BYTE;
8212 del_range_both (from, from_byte, to, to_byte, 1);
8213 set_buffer_internal (XBUFFER (coding->src_object));
8214 }
8215
8216 {
8217 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8218
8219 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8220 old_deactivate_mark);
8221 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8222 make_number (BEG), make_number (Z));
8223 UNGCPRO;
8224 }
8225 if (XBUFFER (coding->src_object) != current_buffer)
8226 kill_src_buffer = 1;
8227 coding->src_object = Fcurrent_buffer ();
8228 if (BEG != GPT)
8229 move_gap_both (BEG, BEG_BYTE);
8230 coding->src_chars = Z - BEG;
8231 coding->src_bytes = Z_BYTE - BEG_BYTE;
8232 coding->src_pos = BEG;
8233 coding->src_pos_byte = BEG_BYTE;
8234 coding->src_multibyte = Z < Z_BYTE;
8235 }
8236 else if (STRINGP (src_object))
8237 {
8238 code_conversion_save (0, 0);
8239 coding->src_pos = from;
8240 coding->src_pos_byte = from_byte;
8241 }
8242 else if (BUFFERP (src_object))
8243 {
8244 code_conversion_save (0, 0);
8245 set_buffer_internal (XBUFFER (src_object));
8246 if (EQ (src_object, dst_object))
8247 {
8248 saved_pt = PT, saved_pt_byte = PT_BYTE;
8249 coding->src_object = del_range_1 (from, to, 1, 1);
8250 coding->src_pos = 0;
8251 coding->src_pos_byte = 0;
8252 }
8253 else
8254 {
8255 if (from < GPT && to >= GPT)
8256 move_gap_both (from, from_byte);
8257 coding->src_pos = from;
8258 coding->src_pos_byte = from_byte;
8259 }
8260 }
8261 else
8262 code_conversion_save (0, 0);
8263
8264 if (BUFFERP (dst_object))
8265 {
8266 coding->dst_object = dst_object;
8267 if (EQ (src_object, dst_object))
8268 {
8269 coding->dst_pos = from;
8270 coding->dst_pos_byte = from_byte;
8271 }
8272 else
8273 {
8274 struct buffer *current = current_buffer;
8275
8276 set_buffer_temp (XBUFFER (dst_object));
8277 coding->dst_pos = PT;
8278 coding->dst_pos_byte = PT_BYTE;
8279 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8280 set_buffer_temp (current);
8281 }
8282 coding->dst_multibyte
8283 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8284 }
8285 else if (EQ (dst_object, Qt))
8286 {
8287 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8288 coding->dst_object = Qnil;
8289 coding->destination = xmalloc (dst_bytes);
8290 coding->dst_bytes = dst_bytes;
8291 coding->dst_multibyte = 0;
8292 }
8293 else
8294 {
8295 coding->dst_object = Qnil;
8296 coding->dst_multibyte = 0;
8297 }
8298
8299 encode_coding (coding);
8300
8301 if (EQ (dst_object, Qt))
8302 {
8303 if (BUFFERP (coding->dst_object))
8304 coding->dst_object = Fbuffer_string ();
8305 else
8306 {
8307 coding->dst_object
8308 = make_unibyte_string ((char *) coding->destination,
8309 coding->produced);
8310 xfree (coding->destination);
8311 }
8312 }
8313
8314 if (saved_pt >= 0)
8315 {
8316 /* This is the case of:
8317 (BUFFERP (src_object) && EQ (src_object, dst_object))
8318 As we have moved PT while replacing the original buffer
8319 contents, we must recover it now. */
8320 set_buffer_internal (XBUFFER (src_object));
8321 if (saved_pt < from)
8322 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8323 else if (saved_pt < from + chars)
8324 TEMP_SET_PT_BOTH (from, from_byte);
8325 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8326 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8327 saved_pt_byte + (coding->produced - bytes));
8328 else
8329 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8330 saved_pt_byte + (coding->produced - bytes));
8331
8332 if (need_marker_adjustment)
8333 {
8334 struct Lisp_Marker *tail;
8335
8336 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8337 if (tail->need_adjustment)
8338 {
8339 tail->need_adjustment = 0;
8340 if (tail->insertion_type)
8341 {
8342 tail->bytepos = from_byte;
8343 tail->charpos = from;
8344 }
8345 else
8346 {
8347 tail->bytepos = from_byte + coding->produced;
8348 tail->charpos
8349 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8350 ? tail->bytepos : from + coding->produced_char);
8351 }
8352 }
8353 }
8354 }
8355
8356 if (kill_src_buffer)
8357 Fkill_buffer (coding->src_object);
8358
8359 Vdeactivate_mark = old_deactivate_mark;
8360 unbind_to (count, Qnil);
8361 }
8362
8363
8364 Lisp_Object
8365 preferred_coding_system (void)
8366 {
8367 int id = coding_categories[coding_priorities[0]].id;
8368
8369 return CODING_ID_NAME (id);
8370 }
8371
8372 #if defined (WINDOWSNT) || defined (CYGWIN)
8373
8374 Lisp_Object
8375 from_unicode (Lisp_Object str)
8376 {
8377 CHECK_STRING (str);
8378 if (!STRING_MULTIBYTE (str) &&
8379 SBYTES (str) & 1)
8380 {
8381 str = Fsubstring (str, make_number (0), make_number (-1));
8382 }
8383
8384 return code_convert_string_norecord (str, Qutf_16le, 0);
8385 }
8386
8387 Lisp_Object
8388 from_unicode_buffer (const wchar_t* wstr)
8389 {
8390 return from_unicode (
8391 make_unibyte_string (
8392 (char*) wstr,
8393 /* we get one of the two final 0 bytes for free. */
8394 1 + sizeof (wchar_t) * wcslen (wstr)));
8395 }
8396
8397 wchar_t *
8398 to_unicode (Lisp_Object str, Lisp_Object *buf)
8399 {
8400 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8401 /* We need to make another copy (in addition to the one made by
8402 code_convert_string_norecord) to ensure that the final string is
8403 _doubly_ zero terminated --- that is, that the string is
8404 terminated by two zero bytes and one utf-16le null character.
8405 Because strings are already terminated with a single zero byte,
8406 we just add one additional zero. */
8407 str = make_uninit_string (SBYTES (*buf) + 1);
8408 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8409 SDATA (str) [SBYTES (*buf)] = '\0';
8410 *buf = str;
8411 return WCSDATA (*buf);
8412 }
8413
8414 #endif /* WINDOWSNT || CYGWIN */
8415
8416 \f
8417 #ifdef emacs
8418 /*** 8. Emacs Lisp library functions ***/
8419
8420 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8421 doc: /* Return t if OBJECT is nil or a coding-system.
8422 See the documentation of `define-coding-system' for information
8423 about coding-system objects. */)
8424 (Lisp_Object object)
8425 {
8426 if (NILP (object)
8427 || CODING_SYSTEM_ID (object) >= 0)
8428 return Qt;
8429 if (! SYMBOLP (object)
8430 || NILP (Fget (object, Qcoding_system_define_form)))
8431 return Qnil;
8432 return Qt;
8433 }
8434
8435 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8436 Sread_non_nil_coding_system, 1, 1, 0,
8437 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8438 (Lisp_Object prompt)
8439 {
8440 Lisp_Object val;
8441 do
8442 {
8443 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8444 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8445 }
8446 while (SCHARS (val) == 0);
8447 return (Fintern (val, Qnil));
8448 }
8449
8450 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8451 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8452 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8453 Ignores case when completing coding systems (all Emacs coding systems
8454 are lower-case). */)
8455 (Lisp_Object prompt, Lisp_Object default_coding_system)
8456 {
8457 Lisp_Object val;
8458 ptrdiff_t count = SPECPDL_INDEX ();
8459
8460 if (SYMBOLP (default_coding_system))
8461 default_coding_system = SYMBOL_NAME (default_coding_system);
8462 specbind (Qcompletion_ignore_case, Qt);
8463 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8464 Qt, Qnil, Qcoding_system_history,
8465 default_coding_system, Qnil);
8466 unbind_to (count, Qnil);
8467 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8468 }
8469
8470 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8471 1, 1, 0,
8472 doc: /* Check validity of CODING-SYSTEM.
8473 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8474 It is valid if it is nil or a symbol defined as a coding system by the
8475 function `define-coding-system'. */)
8476 (Lisp_Object coding_system)
8477 {
8478 Lisp_Object define_form;
8479
8480 define_form = Fget (coding_system, Qcoding_system_define_form);
8481 if (! NILP (define_form))
8482 {
8483 Fput (coding_system, Qcoding_system_define_form, Qnil);
8484 safe_eval (define_form);
8485 }
8486 if (!NILP (Fcoding_system_p (coding_system)))
8487 return coding_system;
8488 xsignal1 (Qcoding_system_error, coding_system);
8489 }
8490
8491 \f
8492 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8493 HIGHEST, return the coding system of the highest
8494 priority among the detected coding systems. Otherwise return a
8495 list of detected coding systems sorted by their priorities. If
8496 MULTIBYTEP, it is assumed that the bytes are in correct
8497 multibyte form but contains only ASCII and eight-bit chars.
8498 Otherwise, the bytes are raw bytes.
8499
8500 CODING-SYSTEM controls the detection as below:
8501
8502 If it is nil, detect both text-format and eol-format. If the
8503 text-format part of CODING-SYSTEM is already specified
8504 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8505 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8506 detect only text-format. */
8507
8508 Lisp_Object
8509 detect_coding_system (const unsigned char *src,
8510 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8511 bool highest, bool multibytep,
8512 Lisp_Object coding_system)
8513 {
8514 const unsigned char *src_end = src + src_bytes;
8515 Lisp_Object attrs, eol_type;
8516 Lisp_Object val = Qnil;
8517 struct coding_system coding;
8518 ptrdiff_t id;
8519 struct coding_detection_info detect_info;
8520 enum coding_category base_category;
8521 bool null_byte_found = 0, eight_bit_found = 0;
8522
8523 if (NILP (coding_system))
8524 coding_system = Qundecided;
8525 setup_coding_system (coding_system, &coding);
8526 attrs = CODING_ID_ATTRS (coding.id);
8527 eol_type = CODING_ID_EOL_TYPE (coding.id);
8528 coding_system = CODING_ATTR_BASE_NAME (attrs);
8529
8530 coding.source = src;
8531 coding.src_chars = src_chars;
8532 coding.src_bytes = src_bytes;
8533 coding.src_multibyte = multibytep;
8534 coding.consumed = 0;
8535 coding.mode |= CODING_MODE_LAST_BLOCK;
8536 coding.head_ascii = 0;
8537
8538 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8539
8540 /* At first, detect text-format if necessary. */
8541 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8542 if (base_category == coding_category_undecided)
8543 {
8544 enum coding_category category IF_LINT (= 0);
8545 struct coding_system *this IF_LINT (= NULL);
8546 int c, i;
8547 int inhibit_nbd /* null byte detection */
8548 = (coding.spec.undecided.inhibit_nbd > 0
8549 | (coding.spec.undecided.inhibit_nbd == 0
8550 & inhibit_null_byte_detection));
8551 int inhibit_ied /* iso escape detection */
8552 = (coding.spec.undecided.inhibit_ied > 0
8553 | (coding.spec.undecided.inhibit_ied == 0
8554 & inhibit_iso_escape_detection));
8555 int prefer_utf_8
8556 = coding.spec.undecided.prefer_utf_8;
8557
8558
8559 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8560 for (; src < src_end; src++)
8561 {
8562 c = *src;
8563 if (c & 0x80)
8564 {
8565 eight_bit_found = 1;
8566 if (null_byte_found)
8567 break;
8568 }
8569 else if (c < 0x20)
8570 {
8571 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8572 && ! inhibit_ied
8573 && ! detect_info.checked)
8574 {
8575 if (detect_coding_iso_2022 (&coding, &detect_info))
8576 {
8577 /* We have scanned the whole data. */
8578 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8579 {
8580 /* We didn't find an 8-bit code. We may
8581 have found a null-byte, but it's very
8582 rare that a binary file confirm to
8583 ISO-2022. */
8584 src = src_end;
8585 coding.head_ascii = src - coding.source;
8586 }
8587 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8588 break;
8589 }
8590 }
8591 else if (! c && !inhibit_nbd)
8592 {
8593 null_byte_found = 1;
8594 if (eight_bit_found)
8595 break;
8596 }
8597 if (! eight_bit_found)
8598 coding.head_ascii++;
8599 }
8600 else if (! eight_bit_found)
8601 coding.head_ascii++;
8602 }
8603
8604 if (null_byte_found || eight_bit_found
8605 || coding.head_ascii < coding.src_bytes
8606 || detect_info.found)
8607 {
8608 if (coding.head_ascii == coding.src_bytes)
8609 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8610 for (i = 0; i < coding_category_raw_text; i++)
8611 {
8612 category = coding_priorities[i];
8613 this = coding_categories + category;
8614 if (detect_info.found & (1 << category))
8615 break;
8616 }
8617 else
8618 {
8619 if (null_byte_found)
8620 {
8621 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8622 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8623 }
8624 else if (prefer_utf_8
8625 && detect_coding_utf_8 (&coding, &detect_info))
8626 {
8627 detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8628 detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8629 }
8630 for (i = 0; i < coding_category_raw_text; i++)
8631 {
8632 category = coding_priorities[i];
8633 this = coding_categories + category;
8634
8635 if (this->id < 0)
8636 {
8637 /* No coding system of this category is defined. */
8638 detect_info.rejected |= (1 << category);
8639 }
8640 else if (category >= coding_category_raw_text)
8641 continue;
8642 else if (detect_info.checked & (1 << category))
8643 {
8644 if (highest
8645 && (detect_info.found & (1 << category)))
8646 break;
8647 }
8648 else if ((*(this->detector)) (&coding, &detect_info)
8649 && highest
8650 && (detect_info.found & (1 << category)))
8651 {
8652 if (category == coding_category_utf_16_auto)
8653 {
8654 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8655 category = coding_category_utf_16_le;
8656 else
8657 category = coding_category_utf_16_be;
8658 }
8659 break;
8660 }
8661 }
8662 }
8663 }
8664
8665 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8666 || null_byte_found)
8667 {
8668 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8669 id = CODING_SYSTEM_ID (Qno_conversion);
8670 val = Fcons (make_number (id), Qnil);
8671 }
8672 else if (! detect_info.rejected && ! detect_info.found)
8673 {
8674 detect_info.found = CATEGORY_MASK_ANY;
8675 id = coding_categories[coding_category_undecided].id;
8676 val = Fcons (make_number (id), Qnil);
8677 }
8678 else if (highest)
8679 {
8680 if (detect_info.found)
8681 {
8682 detect_info.found = 1 << category;
8683 val = Fcons (make_number (this->id), Qnil);
8684 }
8685 else
8686 for (i = 0; i < coding_category_raw_text; i++)
8687 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8688 {
8689 detect_info.found = 1 << coding_priorities[i];
8690 id = coding_categories[coding_priorities[i]].id;
8691 val = Fcons (make_number (id), Qnil);
8692 break;
8693 }
8694 }
8695 else
8696 {
8697 int mask = detect_info.rejected | detect_info.found;
8698 int found = 0;
8699
8700 for (i = coding_category_raw_text - 1; i >= 0; i--)
8701 {
8702 category = coding_priorities[i];
8703 if (! (mask & (1 << category)))
8704 {
8705 found |= 1 << category;
8706 id = coding_categories[category].id;
8707 if (id >= 0)
8708 val = Fcons (make_number (id), val);
8709 }
8710 }
8711 for (i = coding_category_raw_text - 1; i >= 0; i--)
8712 {
8713 category = coding_priorities[i];
8714 if (detect_info.found & (1 << category))
8715 {
8716 id = coding_categories[category].id;
8717 val = Fcons (make_number (id), val);
8718 }
8719 }
8720 detect_info.found |= found;
8721 }
8722 }
8723 else if (base_category == coding_category_utf_8_auto)
8724 {
8725 if (detect_coding_utf_8 (&coding, &detect_info))
8726 {
8727 struct coding_system *this;
8728
8729 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8730 this = coding_categories + coding_category_utf_8_sig;
8731 else
8732 this = coding_categories + coding_category_utf_8_nosig;
8733 val = Fcons (make_number (this->id), Qnil);
8734 }
8735 }
8736 else if (base_category == coding_category_utf_16_auto)
8737 {
8738 if (detect_coding_utf_16 (&coding, &detect_info))
8739 {
8740 struct coding_system *this;
8741
8742 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8743 this = coding_categories + coding_category_utf_16_le;
8744 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8745 this = coding_categories + coding_category_utf_16_be;
8746 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8747 this = coding_categories + coding_category_utf_16_be_nosig;
8748 else
8749 this = coding_categories + coding_category_utf_16_le_nosig;
8750 val = Fcons (make_number (this->id), Qnil);
8751 }
8752 }
8753 else
8754 {
8755 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8756 val = Fcons (make_number (coding.id), Qnil);
8757 }
8758
8759 /* Then, detect eol-format if necessary. */
8760 {
8761 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8762 Lisp_Object tail;
8763
8764 if (VECTORP (eol_type))
8765 {
8766 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8767 {
8768 if (null_byte_found)
8769 normal_eol = EOL_SEEN_LF;
8770 else
8771 normal_eol = detect_eol (coding.source, src_bytes,
8772 coding_category_raw_text);
8773 }
8774 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8775 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8776 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8777 coding_category_utf_16_be);
8778 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8779 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8780 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8781 coding_category_utf_16_le);
8782 }
8783 else
8784 {
8785 if (EQ (eol_type, Qunix))
8786 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8787 else if (EQ (eol_type, Qdos))
8788 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8789 else
8790 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8791 }
8792
8793 for (tail = val; CONSP (tail); tail = XCDR (tail))
8794 {
8795 enum coding_category category;
8796 int this_eol;
8797
8798 id = XINT (XCAR (tail));
8799 attrs = CODING_ID_ATTRS (id);
8800 category = XINT (CODING_ATTR_CATEGORY (attrs));
8801 eol_type = CODING_ID_EOL_TYPE (id);
8802 if (VECTORP (eol_type))
8803 {
8804 if (category == coding_category_utf_16_be
8805 || category == coding_category_utf_16_be_nosig)
8806 this_eol = utf_16_be_eol;
8807 else if (category == coding_category_utf_16_le
8808 || category == coding_category_utf_16_le_nosig)
8809 this_eol = utf_16_le_eol;
8810 else
8811 this_eol = normal_eol;
8812
8813 if (this_eol == EOL_SEEN_LF)
8814 XSETCAR (tail, AREF (eol_type, 0));
8815 else if (this_eol == EOL_SEEN_CRLF)
8816 XSETCAR (tail, AREF (eol_type, 1));
8817 else if (this_eol == EOL_SEEN_CR)
8818 XSETCAR (tail, AREF (eol_type, 2));
8819 else
8820 XSETCAR (tail, CODING_ID_NAME (id));
8821 }
8822 else
8823 XSETCAR (tail, CODING_ID_NAME (id));
8824 }
8825 }
8826
8827 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8828 }
8829
8830
8831 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8832 2, 3, 0,
8833 doc: /* Detect coding system of the text in the region between START and END.
8834 Return a list of possible coding systems ordered by priority.
8835 The coding systems to try and their priorities follows what
8836 the function `coding-system-priority-list' (which see) returns.
8837
8838 If only ASCII characters are found (except for such ISO-2022 control
8839 characters as ESC), it returns a list of single element `undecided'
8840 or its subsidiary coding system according to a detected end-of-line
8841 format.
8842
8843 If optional argument HIGHEST is non-nil, return the coding system of
8844 highest priority. */)
8845 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8846 {
8847 ptrdiff_t from, to;
8848 ptrdiff_t from_byte, to_byte;
8849
8850 validate_region (&start, &end);
8851 from = XINT (start), to = XINT (end);
8852 from_byte = CHAR_TO_BYTE (from);
8853 to_byte = CHAR_TO_BYTE (to);
8854
8855 if (from < GPT && to >= GPT)
8856 move_gap_both (to, to_byte);
8857
8858 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8859 to - from, to_byte - from_byte,
8860 !NILP (highest),
8861 !NILP (BVAR (current_buffer
8862 , enable_multibyte_characters)),
8863 Qnil);
8864 }
8865
8866 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8867 1, 2, 0,
8868 doc: /* Detect coding system of the text in STRING.
8869 Return a list of possible coding systems ordered by priority.
8870 The coding systems to try and their priorities follows what
8871 the function `coding-system-priority-list' (which see) returns.
8872
8873 If only ASCII characters are found (except for such ISO-2022 control
8874 characters as ESC), it returns a list of single element `undecided'
8875 or its subsidiary coding system according to a detected end-of-line
8876 format.
8877
8878 If optional argument HIGHEST is non-nil, return the coding system of
8879 highest priority. */)
8880 (Lisp_Object string, Lisp_Object highest)
8881 {
8882 CHECK_STRING (string);
8883
8884 return detect_coding_system (SDATA (string),
8885 SCHARS (string), SBYTES (string),
8886 !NILP (highest), STRING_MULTIBYTE (string),
8887 Qnil);
8888 }
8889
8890
8891 static bool
8892 char_encodable_p (int c, Lisp_Object attrs)
8893 {
8894 Lisp_Object tail;
8895 struct charset *charset;
8896 Lisp_Object translation_table;
8897
8898 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8899 if (! NILP (translation_table))
8900 c = translate_char (translation_table, c);
8901 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8902 CONSP (tail); tail = XCDR (tail))
8903 {
8904 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8905 if (CHAR_CHARSET_P (c, charset))
8906 break;
8907 }
8908 return (! NILP (tail));
8909 }
8910
8911
8912 /* Return a list of coding systems that safely encode the text between
8913 START and END. If EXCLUDE is non-nil, it is a list of coding
8914 systems not to check. The returned list doesn't contain any such
8915 coding systems. In any case, if the text contains only ASCII or is
8916 unibyte, return t. */
8917
8918 DEFUN ("find-coding-systems-region-internal",
8919 Ffind_coding_systems_region_internal,
8920 Sfind_coding_systems_region_internal, 2, 3, 0,
8921 doc: /* Internal use only. */)
8922 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8923 {
8924 Lisp_Object coding_attrs_list, safe_codings;
8925 ptrdiff_t start_byte, end_byte;
8926 const unsigned char *p, *pbeg, *pend;
8927 int c;
8928 Lisp_Object tail, elt, work_table;
8929
8930 if (STRINGP (start))
8931 {
8932 if (!STRING_MULTIBYTE (start)
8933 || SCHARS (start) == SBYTES (start))
8934 return Qt;
8935 start_byte = 0;
8936 end_byte = SBYTES (start);
8937 }
8938 else
8939 {
8940 CHECK_NUMBER_COERCE_MARKER (start);
8941 CHECK_NUMBER_COERCE_MARKER (end);
8942 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8943 args_out_of_range (start, end);
8944 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8945 return Qt;
8946 start_byte = CHAR_TO_BYTE (XINT (start));
8947 end_byte = CHAR_TO_BYTE (XINT (end));
8948 if (XINT (end) - XINT (start) == end_byte - start_byte)
8949 return Qt;
8950
8951 if (XINT (start) < GPT && XINT (end) > GPT)
8952 {
8953 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8954 move_gap_both (XINT (start), start_byte);
8955 else
8956 move_gap_both (XINT (end), end_byte);
8957 }
8958 }
8959
8960 coding_attrs_list = Qnil;
8961 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8962 if (NILP (exclude)
8963 || NILP (Fmemq (XCAR (tail), exclude)))
8964 {
8965 Lisp_Object attrs;
8966
8967 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8968 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
8969 {
8970 ASET (attrs, coding_attr_trans_tbl,
8971 get_translation_table (attrs, 1, NULL));
8972 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8973 }
8974 }
8975
8976 if (STRINGP (start))
8977 p = pbeg = SDATA (start);
8978 else
8979 p = pbeg = BYTE_POS_ADDR (start_byte);
8980 pend = p + (end_byte - start_byte);
8981
8982 while (p < pend && ASCII_BYTE_P (*p)) p++;
8983 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8984
8985 work_table = Fmake_char_table (Qnil, Qnil);
8986 while (p < pend)
8987 {
8988 if (ASCII_BYTE_P (*p))
8989 p++;
8990 else
8991 {
8992 c = STRING_CHAR_ADVANCE (p);
8993 if (!NILP (char_table_ref (work_table, c)))
8994 /* This character was already checked. Ignore it. */
8995 continue;
8996
8997 charset_map_loaded = 0;
8998 for (tail = coding_attrs_list; CONSP (tail);)
8999 {
9000 elt = XCAR (tail);
9001 if (NILP (elt))
9002 tail = XCDR (tail);
9003 else if (char_encodable_p (c, elt))
9004 tail = XCDR (tail);
9005 else if (CONSP (XCDR (tail)))
9006 {
9007 XSETCAR (tail, XCAR (XCDR (tail)));
9008 XSETCDR (tail, XCDR (XCDR (tail)));
9009 }
9010 else
9011 {
9012 XSETCAR (tail, Qnil);
9013 tail = XCDR (tail);
9014 }
9015 }
9016 if (charset_map_loaded)
9017 {
9018 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9019
9020 if (STRINGP (start))
9021 pbeg = SDATA (start);
9022 else
9023 pbeg = BYTE_POS_ADDR (start_byte);
9024 p = pbeg + p_offset;
9025 pend = pbeg + pend_offset;
9026 }
9027 char_table_set (work_table, c, Qt);
9028 }
9029 }
9030
9031 safe_codings = list2 (Qraw_text, Qno_conversion);
9032 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9033 if (! NILP (XCAR (tail)))
9034 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9035
9036 return safe_codings;
9037 }
9038
9039
9040 DEFUN ("unencodable-char-position", Funencodable_char_position,
9041 Sunencodable_char_position, 3, 5, 0,
9042 doc: /*
9043 Return position of first un-encodable character in a region.
9044 START and END specify the region and CODING-SYSTEM specifies the
9045 encoding to check. Return nil if CODING-SYSTEM does encode the region.
9046
9047 If optional 4th argument COUNT is non-nil, it specifies at most how
9048 many un-encodable characters to search. In this case, the value is a
9049 list of positions.
9050
9051 If optional 5th argument STRING is non-nil, it is a string to search
9052 for un-encodable characters. In that case, START and END are indexes
9053 to the string. */)
9054 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9055 {
9056 EMACS_INT n;
9057 struct coding_system coding;
9058 Lisp_Object attrs, charset_list, translation_table;
9059 Lisp_Object positions;
9060 ptrdiff_t from, to;
9061 const unsigned char *p, *stop, *pend;
9062 bool ascii_compatible;
9063
9064 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9065 attrs = CODING_ID_ATTRS (coding.id);
9066 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9067 return Qnil;
9068 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9069 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9070 translation_table = get_translation_table (attrs, 1, NULL);
9071
9072 if (NILP (string))
9073 {
9074 validate_region (&start, &end);
9075 from = XINT (start);
9076 to = XINT (end);
9077 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9078 || (ascii_compatible
9079 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9080 return Qnil;
9081 p = CHAR_POS_ADDR (from);
9082 pend = CHAR_POS_ADDR (to);
9083 if (from < GPT && to >= GPT)
9084 stop = GPT_ADDR;
9085 else
9086 stop = pend;
9087 }
9088 else
9089 {
9090 CHECK_STRING (string);
9091 CHECK_NATNUM (start);
9092 CHECK_NATNUM (end);
9093 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9094 args_out_of_range_3 (string, start, end);
9095 from = XINT (start);
9096 to = XINT (end);
9097 if (! STRING_MULTIBYTE (string))
9098 return Qnil;
9099 p = SDATA (string) + string_char_to_byte (string, from);
9100 stop = pend = SDATA (string) + string_char_to_byte (string, to);
9101 if (ascii_compatible && (to - from) == (pend - p))
9102 return Qnil;
9103 }
9104
9105 if (NILP (count))
9106 n = 1;
9107 else
9108 {
9109 CHECK_NATNUM (count);
9110 n = XINT (count);
9111 }
9112
9113 positions = Qnil;
9114 charset_map_loaded = 0;
9115 while (1)
9116 {
9117 int c;
9118
9119 if (ascii_compatible)
9120 while (p < stop && ASCII_BYTE_P (*p))
9121 p++, from++;
9122 if (p >= stop)
9123 {
9124 if (p >= pend)
9125 break;
9126 stop = pend;
9127 p = GAP_END_ADDR;
9128 }
9129
9130 c = STRING_CHAR_ADVANCE (p);
9131 if (! (ASCII_CHAR_P (c) && ascii_compatible)
9132 && ! char_charset (translate_char (translation_table, c),
9133 charset_list, NULL))
9134 {
9135 positions = Fcons (make_number (from), positions);
9136 n--;
9137 if (n == 0)
9138 break;
9139 }
9140
9141 from++;
9142 if (charset_map_loaded && NILP (string))
9143 {
9144 p = CHAR_POS_ADDR (from);
9145 pend = CHAR_POS_ADDR (to);
9146 if (from < GPT && to >= GPT)
9147 stop = GPT_ADDR;
9148 else
9149 stop = pend;
9150 charset_map_loaded = 0;
9151 }
9152 }
9153
9154 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9155 }
9156
9157
9158 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9159 Scheck_coding_systems_region, 3, 3, 0,
9160 doc: /* Check if the region is encodable by coding systems.
9161
9162 START and END are buffer positions specifying the region.
9163 CODING-SYSTEM-LIST is a list of coding systems to check.
9164
9165 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9166 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9167 whole region, POS0, POS1, ... are buffer positions where non-encodable
9168 characters are found.
9169
9170 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9171 value is nil.
9172
9173 START may be a string. In that case, check if the string is
9174 encodable, and the value contains indices to the string instead of
9175 buffer positions. END is ignored.
9176
9177 If the current buffer (or START if it is a string) is unibyte, the value
9178 is nil. */)
9179 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9180 {
9181 Lisp_Object list;
9182 ptrdiff_t start_byte, end_byte;
9183 ptrdiff_t pos;
9184 const unsigned char *p, *pbeg, *pend;
9185 int c;
9186 Lisp_Object tail, elt, attrs;
9187
9188 if (STRINGP (start))
9189 {
9190 if (!STRING_MULTIBYTE (start)
9191 || SCHARS (start) == SBYTES (start))
9192 return Qnil;
9193 start_byte = 0;
9194 end_byte = SBYTES (start);
9195 pos = 0;
9196 }
9197 else
9198 {
9199 CHECK_NUMBER_COERCE_MARKER (start);
9200 CHECK_NUMBER_COERCE_MARKER (end);
9201 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9202 args_out_of_range (start, end);
9203 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9204 return Qnil;
9205 start_byte = CHAR_TO_BYTE (XINT (start));
9206 end_byte = CHAR_TO_BYTE (XINT (end));
9207 if (XINT (end) - XINT (start) == end_byte - start_byte)
9208 return Qnil;
9209
9210 if (XINT (start) < GPT && XINT (end) > GPT)
9211 {
9212 if ((GPT - XINT (start)) < (XINT (end) - GPT))
9213 move_gap_both (XINT (start), start_byte);
9214 else
9215 move_gap_both (XINT (end), end_byte);
9216 }
9217 pos = XINT (start);
9218 }
9219
9220 list = Qnil;
9221 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9222 {
9223 elt = XCAR (tail);
9224 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9225 ASET (attrs, coding_attr_trans_tbl,
9226 get_translation_table (attrs, 1, NULL));
9227 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
9228 }
9229
9230 if (STRINGP (start))
9231 p = pbeg = SDATA (start);
9232 else
9233 p = pbeg = BYTE_POS_ADDR (start_byte);
9234 pend = p + (end_byte - start_byte);
9235
9236 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9237 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9238
9239 while (p < pend)
9240 {
9241 if (ASCII_BYTE_P (*p))
9242 p++;
9243 else
9244 {
9245 c = STRING_CHAR_ADVANCE (p);
9246
9247 charset_map_loaded = 0;
9248 for (tail = list; CONSP (tail); tail = XCDR (tail))
9249 {
9250 elt = XCDR (XCAR (tail));
9251 if (! char_encodable_p (c, XCAR (elt)))
9252 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9253 }
9254 if (charset_map_loaded)
9255 {
9256 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9257
9258 if (STRINGP (start))
9259 pbeg = SDATA (start);
9260 else
9261 pbeg = BYTE_POS_ADDR (start_byte);
9262 p = pbeg + p_offset;
9263 pend = pbeg + pend_offset;
9264 }
9265 }
9266 pos++;
9267 }
9268
9269 tail = list;
9270 list = Qnil;
9271 for (; CONSP (tail); tail = XCDR (tail))
9272 {
9273 elt = XCAR (tail);
9274 if (CONSP (XCDR (XCDR (elt))))
9275 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9276 list);
9277 }
9278
9279 return list;
9280 }
9281
9282
9283 static Lisp_Object
9284 code_convert_region (Lisp_Object start, Lisp_Object end,
9285 Lisp_Object coding_system, Lisp_Object dst_object,
9286 bool encodep, bool norecord)
9287 {
9288 struct coding_system coding;
9289 ptrdiff_t from, from_byte, to, to_byte;
9290 Lisp_Object src_object;
9291
9292 if (NILP (coding_system))
9293 coding_system = Qno_conversion;
9294 else
9295 CHECK_CODING_SYSTEM (coding_system);
9296 src_object = Fcurrent_buffer ();
9297 if (NILP (dst_object))
9298 dst_object = src_object;
9299 else if (! EQ (dst_object, Qt))
9300 CHECK_BUFFER (dst_object);
9301
9302 validate_region (&start, &end);
9303 from = XFASTINT (start);
9304 from_byte = CHAR_TO_BYTE (from);
9305 to = XFASTINT (end);
9306 to_byte = CHAR_TO_BYTE (to);
9307
9308 setup_coding_system (coding_system, &coding);
9309 coding.mode |= CODING_MODE_LAST_BLOCK;
9310
9311 if (encodep)
9312 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9313 dst_object);
9314 else
9315 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9316 dst_object);
9317 if (! norecord)
9318 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9319
9320 return (BUFFERP (dst_object)
9321 ? make_number (coding.produced_char)
9322 : coding.dst_object);
9323 }
9324
9325
9326 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9327 3, 4, "r\nzCoding system: ",
9328 doc: /* Decode the current region from the specified coding system.
9329 When called from a program, takes four arguments:
9330 START, END, CODING-SYSTEM, and DESTINATION.
9331 START and END are buffer positions.
9332
9333 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9334 If nil, the region between START and END is replaced by the decoded text.
9335 If buffer, the decoded text is inserted in that buffer after point (point
9336 does not move).
9337 In those cases, the length of the decoded text is returned.
9338 If DESTINATION is t, the decoded text is returned.
9339
9340 This function sets `last-coding-system-used' to the precise coding system
9341 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9342 not fully specified.) */)
9343 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9344 {
9345 return code_convert_region (start, end, coding_system, destination, 0, 0);
9346 }
9347
9348 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9349 3, 4, "r\nzCoding system: ",
9350 doc: /* Encode the current region by specified coding system.
9351 When called from a program, takes four arguments:
9352 START, END, CODING-SYSTEM and DESTINATION.
9353 START and END are buffer positions.
9354
9355 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9356 If nil, the region between START and END is replace by the encoded text.
9357 If buffer, the encoded text is inserted in that buffer after point (point
9358 does not move).
9359 In those cases, the length of the encoded text is returned.
9360 If DESTINATION is t, the encoded text is returned.
9361
9362 This function sets `last-coding-system-used' to the precise coding system
9363 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9364 not fully specified.) */)
9365 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9366 {
9367 return code_convert_region (start, end, coding_system, destination, 1, 0);
9368 }
9369
9370 Lisp_Object
9371 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9372 Lisp_Object dst_object, bool encodep, bool nocopy,
9373 bool norecord)
9374 {
9375 struct coding_system coding;
9376 ptrdiff_t chars, bytes;
9377
9378 CHECK_STRING (string);
9379 if (NILP (coding_system))
9380 {
9381 if (! norecord)
9382 Vlast_coding_system_used = Qno_conversion;
9383 if (NILP (dst_object))
9384 return (nocopy ? Fcopy_sequence (string) : string);
9385 }
9386
9387 if (NILP (coding_system))
9388 coding_system = Qno_conversion;
9389 else
9390 CHECK_CODING_SYSTEM (coding_system);
9391 if (NILP (dst_object))
9392 dst_object = Qt;
9393 else if (! EQ (dst_object, Qt))
9394 CHECK_BUFFER (dst_object);
9395
9396 setup_coding_system (coding_system, &coding);
9397 coding.mode |= CODING_MODE_LAST_BLOCK;
9398 chars = SCHARS (string);
9399 bytes = SBYTES (string);
9400 if (encodep)
9401 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9402 else
9403 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9404 if (! norecord)
9405 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9406
9407 return (BUFFERP (dst_object)
9408 ? make_number (coding.produced_char)
9409 : coding.dst_object);
9410 }
9411
9412
9413 /* Encode or decode STRING according to CODING_SYSTEM.
9414 Do not set Vlast_coding_system_used.
9415
9416 This function is called only from macros DECODE_FILE and
9417 ENCODE_FILE, thus we ignore character composition. */
9418
9419 Lisp_Object
9420 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9421 bool encodep)
9422 {
9423 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9424 }
9425
9426
9427 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9428 2, 4, 0,
9429 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9430
9431 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9432 if the decoding operation is trivial.
9433
9434 Optional fourth arg BUFFER non-nil means that the decoded text is
9435 inserted in that buffer after point (point does not move). In this
9436 case, the return value is the length of the decoded text.
9437
9438 This function sets `last-coding-system-used' to the precise coding system
9439 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9440 not fully specified.) */)
9441 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9442 {
9443 return code_convert_string (string, coding_system, buffer,
9444 0, ! NILP (nocopy), 0);
9445 }
9446
9447 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9448 2, 4, 0,
9449 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9450
9451 Optional third arg NOCOPY non-nil means it is OK to return STRING
9452 itself if the encoding operation is trivial.
9453
9454 Optional fourth arg BUFFER non-nil means that the encoded text is
9455 inserted in that buffer after point (point does not move). In this
9456 case, the return value is the length of the encoded text.
9457
9458 This function sets `last-coding-system-used' to the precise coding system
9459 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9460 not fully specified.) */)
9461 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9462 {
9463 return code_convert_string (string, coding_system, buffer,
9464 1, ! NILP (nocopy), 0);
9465 }
9466
9467 \f
9468 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9469 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9470 Return the corresponding character. */)
9471 (Lisp_Object code)
9472 {
9473 Lisp_Object spec, attrs, val;
9474 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9475 EMACS_INT ch;
9476 int c;
9477
9478 CHECK_NATNUM (code);
9479 ch = XFASTINT (code);
9480 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9481 attrs = AREF (spec, 0);
9482
9483 if (ASCII_BYTE_P (ch)
9484 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9485 return code;
9486
9487 val = CODING_ATTR_CHARSET_LIST (attrs);
9488 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9489 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9490 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9491
9492 if (ch <= 0x7F)
9493 {
9494 c = ch;
9495 charset = charset_roman;
9496 }
9497 else if (ch >= 0xA0 && ch < 0xDF)
9498 {
9499 c = ch - 0x80;
9500 charset = charset_kana;
9501 }
9502 else
9503 {
9504 EMACS_INT c1 = ch >> 8;
9505 int c2 = ch & 0xFF;
9506
9507 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9508 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9509 error ("Invalid code: %"pI"d", ch);
9510 c = ch;
9511 SJIS_TO_JIS (c);
9512 charset = charset_kanji;
9513 }
9514 c = DECODE_CHAR (charset, c);
9515 if (c < 0)
9516 error ("Invalid code: %"pI"d", ch);
9517 return make_number (c);
9518 }
9519
9520
9521 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9522 doc: /* Encode a Japanese character CH to shift_jis encoding.
9523 Return the corresponding code in SJIS. */)
9524 (Lisp_Object ch)
9525 {
9526 Lisp_Object spec, attrs, charset_list;
9527 int c;
9528 struct charset *charset;
9529 unsigned code;
9530
9531 CHECK_CHARACTER (ch);
9532 c = XFASTINT (ch);
9533 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9534 attrs = AREF (spec, 0);
9535
9536 if (ASCII_CHAR_P (c)
9537 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9538 return ch;
9539
9540 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9541 charset = char_charset (c, charset_list, &code);
9542 if (code == CHARSET_INVALID_CODE (charset))
9543 error ("Can't encode by shift_jis encoding: %c", c);
9544 JIS_TO_SJIS (code);
9545
9546 return make_number (code);
9547 }
9548
9549 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9550 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9551 Return the corresponding character. */)
9552 (Lisp_Object code)
9553 {
9554 Lisp_Object spec, attrs, val;
9555 struct charset *charset_roman, *charset_big5, *charset;
9556 EMACS_INT ch;
9557 int c;
9558
9559 CHECK_NATNUM (code);
9560 ch = XFASTINT (code);
9561 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9562 attrs = AREF (spec, 0);
9563
9564 if (ASCII_BYTE_P (ch)
9565 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9566 return code;
9567
9568 val = CODING_ATTR_CHARSET_LIST (attrs);
9569 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9570 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9571
9572 if (ch <= 0x7F)
9573 {
9574 c = ch;
9575 charset = charset_roman;
9576 }
9577 else
9578 {
9579 EMACS_INT b1 = ch >> 8;
9580 int b2 = ch & 0x7F;
9581 if (b1 < 0xA1 || b1 > 0xFE
9582 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9583 error ("Invalid code: %"pI"d", ch);
9584 c = ch;
9585 charset = charset_big5;
9586 }
9587 c = DECODE_CHAR (charset, c);
9588 if (c < 0)
9589 error ("Invalid code: %"pI"d", ch);
9590 return make_number (c);
9591 }
9592
9593 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9594 doc: /* Encode the Big5 character CH to BIG5 coding system.
9595 Return the corresponding character code in Big5. */)
9596 (Lisp_Object ch)
9597 {
9598 Lisp_Object spec, attrs, charset_list;
9599 struct charset *charset;
9600 int c;
9601 unsigned code;
9602
9603 CHECK_CHARACTER (ch);
9604 c = XFASTINT (ch);
9605 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9606 attrs = AREF (spec, 0);
9607 if (ASCII_CHAR_P (c)
9608 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9609 return ch;
9610
9611 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9612 charset = char_charset (c, charset_list, &code);
9613 if (code == CHARSET_INVALID_CODE (charset))
9614 error ("Can't encode by Big5 encoding: %c", c);
9615
9616 return make_number (code);
9617 }
9618
9619 \f
9620 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9621 Sset_terminal_coding_system_internal, 1, 2, 0,
9622 doc: /* Internal use only. */)
9623 (Lisp_Object coding_system, Lisp_Object terminal)
9624 {
9625 struct terminal *term = get_terminal (terminal, 1);
9626 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9627 CHECK_SYMBOL (coding_system);
9628 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9629 /* We had better not send unsafe characters to terminal. */
9630 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9631 /* Character composition should be disabled. */
9632 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9633 terminal_coding->src_multibyte = 1;
9634 terminal_coding->dst_multibyte = 0;
9635 tset_charset_list
9636 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9637 ? coding_charset_list (terminal_coding)
9638 : Fcons (make_number (charset_ascii), Qnil)));
9639 return Qnil;
9640 }
9641
9642 DEFUN ("set-safe-terminal-coding-system-internal",
9643 Fset_safe_terminal_coding_system_internal,
9644 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9645 doc: /* Internal use only. */)
9646 (Lisp_Object coding_system)
9647 {
9648 CHECK_SYMBOL (coding_system);
9649 setup_coding_system (Fcheck_coding_system (coding_system),
9650 &safe_terminal_coding);
9651 /* Character composition should be disabled. */
9652 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9653 safe_terminal_coding.src_multibyte = 1;
9654 safe_terminal_coding.dst_multibyte = 0;
9655 return Qnil;
9656 }
9657
9658 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9659 Sterminal_coding_system, 0, 1, 0,
9660 doc: /* Return coding system specified for terminal output on the given terminal.
9661 TERMINAL may be a terminal object, a frame, or nil for the selected
9662 frame's terminal device. */)
9663 (Lisp_Object terminal)
9664 {
9665 struct coding_system *terminal_coding
9666 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9667 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9668
9669 /* For backward compatibility, return nil if it is `undecided'. */
9670 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9671 }
9672
9673 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9674 Sset_keyboard_coding_system_internal, 1, 2, 0,
9675 doc: /* Internal use only. */)
9676 (Lisp_Object coding_system, Lisp_Object terminal)
9677 {
9678 struct terminal *t = get_terminal (terminal, 1);
9679 CHECK_SYMBOL (coding_system);
9680 if (NILP (coding_system))
9681 coding_system = Qno_conversion;
9682 else
9683 Fcheck_coding_system (coding_system);
9684 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9685 /* Character composition should be disabled. */
9686 TERMINAL_KEYBOARD_CODING (t)->common_flags
9687 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9688 return Qnil;
9689 }
9690
9691 DEFUN ("keyboard-coding-system",
9692 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9693 doc: /* Return coding system specified for decoding keyboard input. */)
9694 (Lisp_Object terminal)
9695 {
9696 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9697 (get_terminal (terminal, 1))->id);
9698 }
9699
9700 \f
9701 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9702 Sfind_operation_coding_system, 1, MANY, 0,
9703 doc: /* Choose a coding system for an operation based on the target name.
9704 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9705 DECODING-SYSTEM is the coding system to use for decoding
9706 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9707 for encoding (in case OPERATION does encoding).
9708
9709 The first argument OPERATION specifies an I/O primitive:
9710 For file I/O, `insert-file-contents' or `write-region'.
9711 For process I/O, `call-process', `call-process-region', or `start-process'.
9712 For network I/O, `open-network-stream'.
9713
9714 The remaining arguments should be the same arguments that were passed
9715 to the primitive. Depending on which primitive, one of those arguments
9716 is selected as the TARGET. For example, if OPERATION does file I/O,
9717 whichever argument specifies the file name is TARGET.
9718
9719 TARGET has a meaning which depends on OPERATION:
9720 For file I/O, TARGET is a file name (except for the special case below).
9721 For process I/O, TARGET is a process name.
9722 For network I/O, TARGET is a service name or a port number.
9723
9724 This function looks up what is specified for TARGET in
9725 `file-coding-system-alist', `process-coding-system-alist',
9726 or `network-coding-system-alist' depending on OPERATION.
9727 They may specify a coding system, a cons of coding systems,
9728 or a function symbol to call.
9729 In the last case, we call the function with one argument,
9730 which is a list of all the arguments given to this function.
9731 If the function can't decide a coding system, it can return
9732 `undecided' so that the normal code-detection is performed.
9733
9734 If OPERATION is `insert-file-contents', the argument corresponding to
9735 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9736 file name to look up, and BUFFER is a buffer that contains the file's
9737 contents (not yet decoded). If `file-coding-system-alist' specifies a
9738 function to call for FILENAME, that function should examine the
9739 contents of BUFFER instead of reading the file.
9740
9741 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9742 (ptrdiff_t nargs, Lisp_Object *args)
9743 {
9744 Lisp_Object operation, target_idx, target, val;
9745 register Lisp_Object chain;
9746
9747 if (nargs < 2)
9748 error ("Too few arguments");
9749 operation = args[0];
9750 if (!SYMBOLP (operation)
9751 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9752 error ("Invalid first argument");
9753 if (nargs <= 1 + XFASTINT (target_idx))
9754 error ("Too few arguments for operation `%s'",
9755 SDATA (SYMBOL_NAME (operation)));
9756 target = args[XFASTINT (target_idx) + 1];
9757 if (!(STRINGP (target)
9758 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9759 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9760 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9761 error ("Invalid argument %"pI"d of operation `%s'",
9762 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9763 if (CONSP (target))
9764 target = XCAR (target);
9765
9766 chain = ((EQ (operation, Qinsert_file_contents)
9767 || EQ (operation, Qwrite_region))
9768 ? Vfile_coding_system_alist
9769 : (EQ (operation, Qopen_network_stream)
9770 ? Vnetwork_coding_system_alist
9771 : Vprocess_coding_system_alist));
9772 if (NILP (chain))
9773 return Qnil;
9774
9775 for (; CONSP (chain); chain = XCDR (chain))
9776 {
9777 Lisp_Object elt;
9778
9779 elt = XCAR (chain);
9780 if (CONSP (elt)
9781 && ((STRINGP (target)
9782 && STRINGP (XCAR (elt))
9783 && fast_string_match (XCAR (elt), target) >= 0)
9784 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9785 {
9786 val = XCDR (elt);
9787 /* Here, if VAL is both a valid coding system and a valid
9788 function symbol, we return VAL as a coding system. */
9789 if (CONSP (val))
9790 return val;
9791 if (! SYMBOLP (val))
9792 return Qnil;
9793 if (! NILP (Fcoding_system_p (val)))
9794 return Fcons (val, val);
9795 if (! NILP (Ffboundp (val)))
9796 {
9797 /* We use call1 rather than safe_call1
9798 so as to get bug reports about functions called here
9799 which don't handle the current interface. */
9800 val = call1 (val, Flist (nargs, args));
9801 if (CONSP (val))
9802 return val;
9803 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9804 return Fcons (val, val);
9805 }
9806 return Qnil;
9807 }
9808 }
9809 return Qnil;
9810 }
9811
9812 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9813 Sset_coding_system_priority, 0, MANY, 0,
9814 doc: /* Assign higher priority to the coding systems given as arguments.
9815 If multiple coding systems belong to the same category,
9816 all but the first one are ignored.
9817
9818 usage: (set-coding-system-priority &rest coding-systems) */)
9819 (ptrdiff_t nargs, Lisp_Object *args)
9820 {
9821 ptrdiff_t i, j;
9822 bool changed[coding_category_max];
9823 enum coding_category priorities[coding_category_max];
9824
9825 memset (changed, 0, sizeof changed);
9826
9827 for (i = j = 0; i < nargs; i++)
9828 {
9829 enum coding_category category;
9830 Lisp_Object spec, attrs;
9831
9832 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9833 attrs = AREF (spec, 0);
9834 category = XINT (CODING_ATTR_CATEGORY (attrs));
9835 if (changed[category])
9836 /* Ignore this coding system because a coding system of the
9837 same category already had a higher priority. */
9838 continue;
9839 changed[category] = 1;
9840 priorities[j++] = category;
9841 if (coding_categories[category].id >= 0
9842 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9843 setup_coding_system (args[i], &coding_categories[category]);
9844 Fset (AREF (Vcoding_category_table, category), args[i]);
9845 }
9846
9847 /* Now we have decided top J priorities. Reflect the order of the
9848 original priorities to the remaining priorities. */
9849
9850 for (i = j, j = 0; i < coding_category_max; i++, j++)
9851 {
9852 while (j < coding_category_max
9853 && changed[coding_priorities[j]])
9854 j++;
9855 if (j == coding_category_max)
9856 emacs_abort ();
9857 priorities[i] = coding_priorities[j];
9858 }
9859
9860 memcpy (coding_priorities, priorities, sizeof priorities);
9861
9862 /* Update `coding-category-list'. */
9863 Vcoding_category_list = Qnil;
9864 for (i = coding_category_max; i-- > 0; )
9865 Vcoding_category_list
9866 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9867 Vcoding_category_list);
9868
9869 return Qnil;
9870 }
9871
9872 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9873 Scoding_system_priority_list, 0, 1, 0,
9874 doc: /* Return a list of coding systems ordered by their priorities.
9875 The list contains a subset of coding systems; i.e. coding systems
9876 assigned to each coding category (see `coding-category-list').
9877
9878 HIGHESTP non-nil means just return the highest priority one. */)
9879 (Lisp_Object highestp)
9880 {
9881 int i;
9882 Lisp_Object val;
9883
9884 for (i = 0, val = Qnil; i < coding_category_max; i++)
9885 {
9886 enum coding_category category = coding_priorities[i];
9887 int id = coding_categories[category].id;
9888 Lisp_Object attrs;
9889
9890 if (id < 0)
9891 continue;
9892 attrs = CODING_ID_ATTRS (id);
9893 if (! NILP (highestp))
9894 return CODING_ATTR_BASE_NAME (attrs);
9895 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9896 }
9897 return Fnreverse (val);
9898 }
9899
9900 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9901
9902 static Lisp_Object
9903 make_subsidiaries (Lisp_Object base)
9904 {
9905 Lisp_Object subsidiaries;
9906 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9907 char *buf = alloca (base_name_len + 6);
9908 int i;
9909
9910 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9911 subsidiaries = make_uninit_vector (3);
9912 for (i = 0; i < 3; i++)
9913 {
9914 strcpy (buf + base_name_len, suffixes[i]);
9915 ASET (subsidiaries, i, intern (buf));
9916 }
9917 return subsidiaries;
9918 }
9919
9920
9921 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9922 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9923 doc: /* For internal use only.
9924 usage: (define-coding-system-internal ...) */)
9925 (ptrdiff_t nargs, Lisp_Object *args)
9926 {
9927 Lisp_Object name;
9928 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9929 Lisp_Object attrs; /* Vector of attributes. */
9930 Lisp_Object eol_type;
9931 Lisp_Object aliases;
9932 Lisp_Object coding_type, charset_list, safe_charsets;
9933 enum coding_category category;
9934 Lisp_Object tail, val;
9935 int max_charset_id = 0;
9936 int i;
9937
9938 if (nargs < coding_arg_max)
9939 goto short_args;
9940
9941 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9942
9943 name = args[coding_arg_name];
9944 CHECK_SYMBOL (name);
9945 ASET (attrs, coding_attr_base_name, name);
9946
9947 val = args[coding_arg_mnemonic];
9948 if (! STRINGP (val))
9949 CHECK_CHARACTER (val);
9950 ASET (attrs, coding_attr_mnemonic, val);
9951
9952 coding_type = args[coding_arg_coding_type];
9953 CHECK_SYMBOL (coding_type);
9954 ASET (attrs, coding_attr_type, coding_type);
9955
9956 charset_list = args[coding_arg_charset_list];
9957 if (SYMBOLP (charset_list))
9958 {
9959 if (EQ (charset_list, Qiso_2022))
9960 {
9961 if (! EQ (coding_type, Qiso_2022))
9962 error ("Invalid charset-list");
9963 charset_list = Viso_2022_charset_list;
9964 }
9965 else if (EQ (charset_list, Qemacs_mule))
9966 {
9967 if (! EQ (coding_type, Qemacs_mule))
9968 error ("Invalid charset-list");
9969 charset_list = Vemacs_mule_charset_list;
9970 }
9971 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9972 {
9973 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9974 error ("Invalid charset-list");
9975 if (max_charset_id < XFASTINT (XCAR (tail)))
9976 max_charset_id = XFASTINT (XCAR (tail));
9977 }
9978 }
9979 else
9980 {
9981 charset_list = Fcopy_sequence (charset_list);
9982 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9983 {
9984 struct charset *charset;
9985
9986 val = XCAR (tail);
9987 CHECK_CHARSET_GET_CHARSET (val, charset);
9988 if (EQ (coding_type, Qiso_2022)
9989 ? CHARSET_ISO_FINAL (charset) < 0
9990 : EQ (coding_type, Qemacs_mule)
9991 ? CHARSET_EMACS_MULE_ID (charset) < 0
9992 : 0)
9993 error ("Can't handle charset `%s'",
9994 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9995
9996 XSETCAR (tail, make_number (charset->id));
9997 if (max_charset_id < charset->id)
9998 max_charset_id = charset->id;
9999 }
10000 }
10001 ASET (attrs, coding_attr_charset_list, charset_list);
10002
10003 safe_charsets = make_uninit_string (max_charset_id + 1);
10004 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10005 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10006 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10007 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10008
10009 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10010
10011 val = args[coding_arg_decode_translation_table];
10012 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10013 CHECK_SYMBOL (val);
10014 ASET (attrs, coding_attr_decode_tbl, val);
10015
10016 val = args[coding_arg_encode_translation_table];
10017 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10018 CHECK_SYMBOL (val);
10019 ASET (attrs, coding_attr_encode_tbl, val);
10020
10021 val = args[coding_arg_post_read_conversion];
10022 CHECK_SYMBOL (val);
10023 ASET (attrs, coding_attr_post_read, val);
10024
10025 val = args[coding_arg_pre_write_conversion];
10026 CHECK_SYMBOL (val);
10027 ASET (attrs, coding_attr_pre_write, val);
10028
10029 val = args[coding_arg_default_char];
10030 if (NILP (val))
10031 ASET (attrs, coding_attr_default_char, make_number (' '));
10032 else
10033 {
10034 CHECK_CHARACTER (val);
10035 ASET (attrs, coding_attr_default_char, val);
10036 }
10037
10038 val = args[coding_arg_for_unibyte];
10039 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10040
10041 val = args[coding_arg_plist];
10042 CHECK_LIST (val);
10043 ASET (attrs, coding_attr_plist, val);
10044
10045 if (EQ (coding_type, Qcharset))
10046 {
10047 /* Generate a lisp vector of 256 elements. Each element is nil,
10048 integer, or a list of charset IDs.
10049
10050 If Nth element is nil, the byte code N is invalid in this
10051 coding system.
10052
10053 If Nth element is a number NUM, N is the first byte of a
10054 charset whose ID is NUM.
10055
10056 If Nth element is a list of charset IDs, N is the first byte
10057 of one of them. The list is sorted by dimensions of the
10058 charsets. A charset of smaller dimension comes first. */
10059 val = Fmake_vector (make_number (256), Qnil);
10060
10061 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10062 {
10063 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10064 int dim = CHARSET_DIMENSION (charset);
10065 int idx = (dim - 1) * 4;
10066
10067 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10068 ASET (attrs, coding_attr_ascii_compat, Qt);
10069
10070 for (i = charset->code_space[idx];
10071 i <= charset->code_space[idx + 1]; i++)
10072 {
10073 Lisp_Object tmp, tmp2;
10074 int dim2;
10075
10076 tmp = AREF (val, i);
10077 if (NILP (tmp))
10078 tmp = XCAR (tail);
10079 else if (NUMBERP (tmp))
10080 {
10081 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10082 if (dim < dim2)
10083 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
10084 else
10085 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
10086 }
10087 else
10088 {
10089 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10090 {
10091 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10092 if (dim < dim2)
10093 break;
10094 }
10095 if (NILP (tmp2))
10096 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
10097 else
10098 {
10099 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10100 XSETCAR (tmp2, XCAR (tail));
10101 }
10102 }
10103 ASET (val, i, tmp);
10104 }
10105 }
10106 ASET (attrs, coding_attr_charset_valids, val);
10107 category = coding_category_charset;
10108 }
10109 else if (EQ (coding_type, Qccl))
10110 {
10111 Lisp_Object valids;
10112
10113 if (nargs < coding_arg_ccl_max)
10114 goto short_args;
10115
10116 val = args[coding_arg_ccl_decoder];
10117 CHECK_CCL_PROGRAM (val);
10118 if (VECTORP (val))
10119 val = Fcopy_sequence (val);
10120 ASET (attrs, coding_attr_ccl_decoder, val);
10121
10122 val = args[coding_arg_ccl_encoder];
10123 CHECK_CCL_PROGRAM (val);
10124 if (VECTORP (val))
10125 val = Fcopy_sequence (val);
10126 ASET (attrs, coding_attr_ccl_encoder, val);
10127
10128 val = args[coding_arg_ccl_valids];
10129 valids = Fmake_string (make_number (256), make_number (0));
10130 for (tail = val; CONSP (tail); tail = XCDR (tail))
10131 {
10132 int from, to;
10133
10134 val = XCAR (tail);
10135 if (INTEGERP (val))
10136 {
10137 if (! (0 <= XINT (val) && XINT (val) <= 255))
10138 args_out_of_range_3 (val, make_number (0), make_number (255));
10139 from = to = XINT (val);
10140 }
10141 else
10142 {
10143 CHECK_CONS (val);
10144 CHECK_NATNUM_CAR (val);
10145 CHECK_NUMBER_CDR (val);
10146 if (XINT (XCAR (val)) > 255)
10147 args_out_of_range_3 (XCAR (val),
10148 make_number (0), make_number (255));
10149 from = XINT (XCAR (val));
10150 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10151 args_out_of_range_3 (XCDR (val),
10152 XCAR (val), make_number (255));
10153 to = XINT (XCDR (val));
10154 }
10155 for (i = from; i <= to; i++)
10156 SSET (valids, i, 1);
10157 }
10158 ASET (attrs, coding_attr_ccl_valids, valids);
10159
10160 category = coding_category_ccl;
10161 }
10162 else if (EQ (coding_type, Qutf_16))
10163 {
10164 Lisp_Object bom, endian;
10165
10166 ASET (attrs, coding_attr_ascii_compat, Qnil);
10167
10168 if (nargs < coding_arg_utf16_max)
10169 goto short_args;
10170
10171 bom = args[coding_arg_utf16_bom];
10172 if (! NILP (bom) && ! EQ (bom, Qt))
10173 {
10174 CHECK_CONS (bom);
10175 val = XCAR (bom);
10176 CHECK_CODING_SYSTEM (val);
10177 val = XCDR (bom);
10178 CHECK_CODING_SYSTEM (val);
10179 }
10180 ASET (attrs, coding_attr_utf_bom, bom);
10181
10182 endian = args[coding_arg_utf16_endian];
10183 CHECK_SYMBOL (endian);
10184 if (NILP (endian))
10185 endian = Qbig;
10186 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10187 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10188 ASET (attrs, coding_attr_utf_16_endian, endian);
10189
10190 category = (CONSP (bom)
10191 ? coding_category_utf_16_auto
10192 : NILP (bom)
10193 ? (EQ (endian, Qbig)
10194 ? coding_category_utf_16_be_nosig
10195 : coding_category_utf_16_le_nosig)
10196 : (EQ (endian, Qbig)
10197 ? coding_category_utf_16_be
10198 : coding_category_utf_16_le));
10199 }
10200 else if (EQ (coding_type, Qiso_2022))
10201 {
10202 Lisp_Object initial, reg_usage, request, flags;
10203
10204 if (nargs < coding_arg_iso2022_max)
10205 goto short_args;
10206
10207 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10208 CHECK_VECTOR (initial);
10209 for (i = 0; i < 4; i++)
10210 {
10211 val = AREF (initial, i);
10212 if (! NILP (val))
10213 {
10214 struct charset *charset;
10215
10216 CHECK_CHARSET_GET_CHARSET (val, charset);
10217 ASET (initial, i, make_number (CHARSET_ID (charset)));
10218 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10219 ASET (attrs, coding_attr_ascii_compat, Qt);
10220 }
10221 else
10222 ASET (initial, i, make_number (-1));
10223 }
10224
10225 reg_usage = args[coding_arg_iso2022_reg_usage];
10226 CHECK_CONS (reg_usage);
10227 CHECK_NUMBER_CAR (reg_usage);
10228 CHECK_NUMBER_CDR (reg_usage);
10229
10230 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10231 for (tail = request; CONSP (tail); tail = XCDR (tail))
10232 {
10233 int id;
10234 Lisp_Object tmp1;
10235
10236 val = XCAR (tail);
10237 CHECK_CONS (val);
10238 tmp1 = XCAR (val);
10239 CHECK_CHARSET_GET_ID (tmp1, id);
10240 CHECK_NATNUM_CDR (val);
10241 if (XINT (XCDR (val)) >= 4)
10242 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10243 XSETCAR (val, make_number (id));
10244 }
10245
10246 flags = args[coding_arg_iso2022_flags];
10247 CHECK_NATNUM (flags);
10248 i = XINT (flags) & INT_MAX;
10249 if (EQ (args[coding_arg_charset_list], Qiso_2022))
10250 i |= CODING_ISO_FLAG_FULL_SUPPORT;
10251 flags = make_number (i);
10252
10253 ASET (attrs, coding_attr_iso_initial, initial);
10254 ASET (attrs, coding_attr_iso_usage, reg_usage);
10255 ASET (attrs, coding_attr_iso_request, request);
10256 ASET (attrs, coding_attr_iso_flags, flags);
10257 setup_iso_safe_charsets (attrs);
10258
10259 if (i & CODING_ISO_FLAG_SEVEN_BITS)
10260 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10261 | CODING_ISO_FLAG_SINGLE_SHIFT))
10262 ? coding_category_iso_7_else
10263 : EQ (args[coding_arg_charset_list], Qiso_2022)
10264 ? coding_category_iso_7
10265 : coding_category_iso_7_tight);
10266 else
10267 {
10268 int id = XINT (AREF (initial, 1));
10269
10270 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10271 || EQ (args[coding_arg_charset_list], Qiso_2022)
10272 || id < 0)
10273 ? coding_category_iso_8_else
10274 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10275 ? coding_category_iso_8_1
10276 : coding_category_iso_8_2);
10277 }
10278 if (category != coding_category_iso_8_1
10279 && category != coding_category_iso_8_2)
10280 ASET (attrs, coding_attr_ascii_compat, Qnil);
10281 }
10282 else if (EQ (coding_type, Qemacs_mule))
10283 {
10284 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10285 ASET (attrs, coding_attr_emacs_mule_full, Qt);
10286 ASET (attrs, coding_attr_ascii_compat, Qt);
10287 category = coding_category_emacs_mule;
10288 }
10289 else if (EQ (coding_type, Qshift_jis))
10290 {
10291
10292 struct charset *charset;
10293
10294 if (XINT (Flength (charset_list)) != 3
10295 && XINT (Flength (charset_list)) != 4)
10296 error ("There should be three or four charsets");
10297
10298 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10299 if (CHARSET_DIMENSION (charset) != 1)
10300 error ("Dimension of charset %s is not one",
10301 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10302 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10303 ASET (attrs, coding_attr_ascii_compat, Qt);
10304
10305 charset_list = XCDR (charset_list);
10306 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10307 if (CHARSET_DIMENSION (charset) != 1)
10308 error ("Dimension of charset %s is not one",
10309 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10310
10311 charset_list = XCDR (charset_list);
10312 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10313 if (CHARSET_DIMENSION (charset) != 2)
10314 error ("Dimension of charset %s is not two",
10315 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10316
10317 charset_list = XCDR (charset_list);
10318 if (! NILP (charset_list))
10319 {
10320 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10321 if (CHARSET_DIMENSION (charset) != 2)
10322 error ("Dimension of charset %s is not two",
10323 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10324 }
10325
10326 category = coding_category_sjis;
10327 Vsjis_coding_system = name;
10328 }
10329 else if (EQ (coding_type, Qbig5))
10330 {
10331 struct charset *charset;
10332
10333 if (XINT (Flength (charset_list)) != 2)
10334 error ("There should be just two charsets");
10335
10336 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10337 if (CHARSET_DIMENSION (charset) != 1)
10338 error ("Dimension of charset %s is not one",
10339 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10340 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10341 ASET (attrs, coding_attr_ascii_compat, Qt);
10342
10343 charset_list = XCDR (charset_list);
10344 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10345 if (CHARSET_DIMENSION (charset) != 2)
10346 error ("Dimension of charset %s is not two",
10347 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10348
10349 category = coding_category_big5;
10350 Vbig5_coding_system = name;
10351 }
10352 else if (EQ (coding_type, Qraw_text))
10353 {
10354 category = coding_category_raw_text;
10355 ASET (attrs, coding_attr_ascii_compat, Qt);
10356 }
10357 else if (EQ (coding_type, Qutf_8))
10358 {
10359 Lisp_Object bom;
10360
10361 if (nargs < coding_arg_utf8_max)
10362 goto short_args;
10363
10364 bom = args[coding_arg_utf8_bom];
10365 if (! NILP (bom) && ! EQ (bom, Qt))
10366 {
10367 CHECK_CONS (bom);
10368 val = XCAR (bom);
10369 CHECK_CODING_SYSTEM (val);
10370 val = XCDR (bom);
10371 CHECK_CODING_SYSTEM (val);
10372 }
10373 ASET (attrs, coding_attr_utf_bom, bom);
10374 if (NILP (bom))
10375 ASET (attrs, coding_attr_ascii_compat, Qt);
10376
10377 category = (CONSP (bom) ? coding_category_utf_8_auto
10378 : NILP (bom) ? coding_category_utf_8_nosig
10379 : coding_category_utf_8_sig);
10380 }
10381 else if (EQ (coding_type, Qundecided))
10382 {
10383 if (nargs < coding_arg_undecided_max)
10384 goto short_args;
10385 ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10386 args[coding_arg_undecided_inhibit_null_byte_detection]);
10387 ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10388 args[coding_arg_undecided_inhibit_iso_escape_detection]);
10389 ASET (attrs, coding_attr_undecided_prefer_utf_8,
10390 args[coding_arg_undecided_prefer_utf_8]);
10391 category = coding_category_undecided;
10392 }
10393 else
10394 error ("Invalid coding system type: %s",
10395 SDATA (SYMBOL_NAME (coding_type)));
10396
10397 ASET (attrs, coding_attr_category, make_number (category));
10398 ASET (attrs, coding_attr_plist,
10399 Fcons (QCcategory,
10400 Fcons (AREF (Vcoding_category_table, category),
10401 CODING_ATTR_PLIST (attrs))));
10402 ASET (attrs, coding_attr_plist,
10403 Fcons (QCascii_compatible_p,
10404 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10405 CODING_ATTR_PLIST (attrs))));
10406
10407 eol_type = args[coding_arg_eol_type];
10408 if (! NILP (eol_type)
10409 && ! EQ (eol_type, Qunix)
10410 && ! EQ (eol_type, Qdos)
10411 && ! EQ (eol_type, Qmac))
10412 error ("Invalid eol-type");
10413
10414 aliases = Fcons (name, Qnil);
10415
10416 if (NILP (eol_type))
10417 {
10418 eol_type = make_subsidiaries (name);
10419 for (i = 0; i < 3; i++)
10420 {
10421 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10422
10423 this_name = AREF (eol_type, i);
10424 this_aliases = Fcons (this_name, Qnil);
10425 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10426 this_spec = make_uninit_vector (3);
10427 ASET (this_spec, 0, attrs);
10428 ASET (this_spec, 1, this_aliases);
10429 ASET (this_spec, 2, this_eol_type);
10430 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10431 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10432 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10433 if (NILP (val))
10434 Vcoding_system_alist
10435 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10436 Vcoding_system_alist);
10437 }
10438 }
10439
10440 spec_vec = make_uninit_vector (3);
10441 ASET (spec_vec, 0, attrs);
10442 ASET (spec_vec, 1, aliases);
10443 ASET (spec_vec, 2, eol_type);
10444
10445 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10446 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10447 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10448 if (NILP (val))
10449 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10450 Vcoding_system_alist);
10451
10452 {
10453 int id = coding_categories[category].id;
10454
10455 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10456 setup_coding_system (name, &coding_categories[category]);
10457 }
10458
10459 return Qnil;
10460
10461 short_args:
10462 return Fsignal (Qwrong_number_of_arguments,
10463 Fcons (intern ("define-coding-system-internal"),
10464 make_number (nargs)));
10465 }
10466
10467
10468 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10469 3, 3, 0,
10470 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10471 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10472 {
10473 Lisp_Object spec, attrs;
10474
10475 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10476 attrs = AREF (spec, 0);
10477 if (EQ (prop, QCmnemonic))
10478 {
10479 if (! STRINGP (val))
10480 CHECK_CHARACTER (val);
10481 ASET (attrs, coding_attr_mnemonic, val);
10482 }
10483 else if (EQ (prop, QCdefault_char))
10484 {
10485 if (NILP (val))
10486 val = make_number (' ');
10487 else
10488 CHECK_CHARACTER (val);
10489 ASET (attrs, coding_attr_default_char, val);
10490 }
10491 else if (EQ (prop, QCdecode_translation_table))
10492 {
10493 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10494 CHECK_SYMBOL (val);
10495 ASET (attrs, coding_attr_decode_tbl, val);
10496 }
10497 else if (EQ (prop, QCencode_translation_table))
10498 {
10499 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10500 CHECK_SYMBOL (val);
10501 ASET (attrs, coding_attr_encode_tbl, val);
10502 }
10503 else if (EQ (prop, QCpost_read_conversion))
10504 {
10505 CHECK_SYMBOL (val);
10506 ASET (attrs, coding_attr_post_read, val);
10507 }
10508 else if (EQ (prop, QCpre_write_conversion))
10509 {
10510 CHECK_SYMBOL (val);
10511 ASET (attrs, coding_attr_pre_write, val);
10512 }
10513 else if (EQ (prop, QCascii_compatible_p))
10514 {
10515 ASET (attrs, coding_attr_ascii_compat, val);
10516 }
10517
10518 ASET (attrs, coding_attr_plist,
10519 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10520 return val;
10521 }
10522
10523
10524 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10525 Sdefine_coding_system_alias, 2, 2, 0,
10526 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10527 (Lisp_Object alias, Lisp_Object coding_system)
10528 {
10529 Lisp_Object spec, aliases, eol_type, val;
10530
10531 CHECK_SYMBOL (alias);
10532 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10533 aliases = AREF (spec, 1);
10534 /* ALIASES should be a list of length more than zero, and the first
10535 element is a base coding system. Append ALIAS at the tail of the
10536 list. */
10537 while (!NILP (XCDR (aliases)))
10538 aliases = XCDR (aliases);
10539 XSETCDR (aliases, Fcons (alias, Qnil));
10540
10541 eol_type = AREF (spec, 2);
10542 if (VECTORP (eol_type))
10543 {
10544 Lisp_Object subsidiaries;
10545 int i;
10546
10547 subsidiaries = make_subsidiaries (alias);
10548 for (i = 0; i < 3; i++)
10549 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10550 AREF (eol_type, i));
10551 }
10552
10553 Fputhash (alias, spec, Vcoding_system_hash_table);
10554 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10555 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10556 if (NILP (val))
10557 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10558 Vcoding_system_alist);
10559
10560 return Qnil;
10561 }
10562
10563 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10564 1, 1, 0,
10565 doc: /* Return the base of CODING-SYSTEM.
10566 Any alias or subsidiary coding system is not a base coding system. */)
10567 (Lisp_Object coding_system)
10568 {
10569 Lisp_Object spec, attrs;
10570
10571 if (NILP (coding_system))
10572 return (Qno_conversion);
10573 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10574 attrs = AREF (spec, 0);
10575 return CODING_ATTR_BASE_NAME (attrs);
10576 }
10577
10578 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10579 1, 1, 0,
10580 doc: "Return the property list of CODING-SYSTEM.")
10581 (Lisp_Object coding_system)
10582 {
10583 Lisp_Object spec, attrs;
10584
10585 if (NILP (coding_system))
10586 coding_system = Qno_conversion;
10587 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10588 attrs = AREF (spec, 0);
10589 return CODING_ATTR_PLIST (attrs);
10590 }
10591
10592
10593 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10594 1, 1, 0,
10595 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10596 (Lisp_Object coding_system)
10597 {
10598 Lisp_Object spec;
10599
10600 if (NILP (coding_system))
10601 coding_system = Qno_conversion;
10602 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10603 return AREF (spec, 1);
10604 }
10605
10606 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10607 Scoding_system_eol_type, 1, 1, 0,
10608 doc: /* Return eol-type of CODING-SYSTEM.
10609 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10610
10611 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10612 and CR respectively.
10613
10614 A vector value indicates that a format of end-of-line should be
10615 detected automatically. Nth element of the vector is the subsidiary
10616 coding system whose eol-type is N. */)
10617 (Lisp_Object coding_system)
10618 {
10619 Lisp_Object spec, eol_type;
10620 int n;
10621
10622 if (NILP (coding_system))
10623 coding_system = Qno_conversion;
10624 if (! CODING_SYSTEM_P (coding_system))
10625 return Qnil;
10626 spec = CODING_SYSTEM_SPEC (coding_system);
10627 eol_type = AREF (spec, 2);
10628 if (VECTORP (eol_type))
10629 return Fcopy_sequence (eol_type);
10630 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10631 return make_number (n);
10632 }
10633
10634 #endif /* emacs */
10635
10636 \f
10637 /*** 9. Post-amble ***/
10638
10639 void
10640 init_coding_once (void)
10641 {
10642 int i;
10643
10644 for (i = 0; i < coding_category_max; i++)
10645 {
10646 coding_categories[i].id = -1;
10647 coding_priorities[i] = i;
10648 }
10649
10650 /* ISO2022 specific initialize routine. */
10651 for (i = 0; i < 0x20; i++)
10652 iso_code_class[i] = ISO_control_0;
10653 for (i = 0x21; i < 0x7F; i++)
10654 iso_code_class[i] = ISO_graphic_plane_0;
10655 for (i = 0x80; i < 0xA0; i++)
10656 iso_code_class[i] = ISO_control_1;
10657 for (i = 0xA1; i < 0xFF; i++)
10658 iso_code_class[i] = ISO_graphic_plane_1;
10659 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10660 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10661 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10662 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10663 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10664 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10665 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10666 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10667 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10668
10669 for (i = 0; i < 256; i++)
10670 {
10671 emacs_mule_bytes[i] = 1;
10672 }
10673 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10674 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10675 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10676 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10677 }
10678
10679 #ifdef emacs
10680
10681 void
10682 syms_of_coding (void)
10683 {
10684 staticpro (&Vcoding_system_hash_table);
10685 {
10686 Lisp_Object args[2];
10687 args[0] = QCtest;
10688 args[1] = Qeq;
10689 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10690 }
10691
10692 staticpro (&Vsjis_coding_system);
10693 Vsjis_coding_system = Qnil;
10694
10695 staticpro (&Vbig5_coding_system);
10696 Vbig5_coding_system = Qnil;
10697
10698 staticpro (&Vcode_conversion_reused_workbuf);
10699 Vcode_conversion_reused_workbuf = Qnil;
10700
10701 staticpro (&Vcode_conversion_workbuf_name);
10702 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10703
10704 reused_workbuf_in_use = 0;
10705
10706 DEFSYM (Qcharset, "charset");
10707 DEFSYM (Qtarget_idx, "target-idx");
10708 DEFSYM (Qcoding_system_history, "coding-system-history");
10709 Fset (Qcoding_system_history, Qnil);
10710
10711 /* Target FILENAME is the first argument. */
10712 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10713 /* Target FILENAME is the third argument. */
10714 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10715
10716 DEFSYM (Qcall_process, "call-process");
10717 /* Target PROGRAM is the first argument. */
10718 Fput (Qcall_process, Qtarget_idx, make_number (0));
10719
10720 DEFSYM (Qcall_process_region, "call-process-region");
10721 /* Target PROGRAM is the third argument. */
10722 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10723
10724 DEFSYM (Qstart_process, "start-process");
10725 /* Target PROGRAM is the third argument. */
10726 Fput (Qstart_process, Qtarget_idx, make_number (2));
10727
10728 DEFSYM (Qopen_network_stream, "open-network-stream");
10729 /* Target SERVICE is the fourth argument. */
10730 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10731
10732 DEFSYM (Qcoding_system, "coding-system");
10733 DEFSYM (Qcoding_aliases, "coding-aliases");
10734
10735 DEFSYM (Qeol_type, "eol-type");
10736 DEFSYM (Qunix, "unix");
10737 DEFSYM (Qdos, "dos");
10738 DEFSYM (Qmac, "mac");
10739
10740 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10741 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10742 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10743 DEFSYM (Qdefault_char, "default-char");
10744 DEFSYM (Qundecided, "undecided");
10745 DEFSYM (Qno_conversion, "no-conversion");
10746 DEFSYM (Qraw_text, "raw-text");
10747
10748 DEFSYM (Qiso_2022, "iso-2022");
10749
10750 DEFSYM (Qutf_8, "utf-8");
10751 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10752
10753 #if defined (WINDOWSNT) || defined (CYGWIN)
10754 /* No, not utf-16-le: that one has a BOM. */
10755 DEFSYM (Qutf_16le, "utf-16le");
10756 #endif
10757
10758 DEFSYM (Qutf_16, "utf-16");
10759 DEFSYM (Qbig, "big");
10760 DEFSYM (Qlittle, "little");
10761
10762 DEFSYM (Qshift_jis, "shift-jis");
10763 DEFSYM (Qbig5, "big5");
10764
10765 DEFSYM (Qcoding_system_p, "coding-system-p");
10766
10767 DEFSYM (Qcoding_system_error, "coding-system-error");
10768 Fput (Qcoding_system_error, Qerror_conditions,
10769 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10770 Fput (Qcoding_system_error, Qerror_message,
10771 build_pure_c_string ("Invalid coding system"));
10772
10773 /* Intern this now in case it isn't already done.
10774 Setting this variable twice is harmless.
10775 But don't staticpro it here--that is done in alloc.c. */
10776 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10777
10778 DEFSYM (Qtranslation_table, "translation-table");
10779 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10780 DEFSYM (Qtranslation_table_id, "translation-table-id");
10781 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10782 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10783
10784 DEFSYM (Qvalid_codes, "valid-codes");
10785
10786 DEFSYM (Qemacs_mule, "emacs-mule");
10787
10788 DEFSYM (QCcategory, ":category");
10789 DEFSYM (QCmnemonic, ":mnemonic");
10790 DEFSYM (QCdefault_char, ":default-char");
10791 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10792 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10793 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10794 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10795 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10796
10797 Vcoding_category_table
10798 = Fmake_vector (make_number (coding_category_max), Qnil);
10799 staticpro (&Vcoding_category_table);
10800 /* Followings are target of code detection. */
10801 ASET (Vcoding_category_table, coding_category_iso_7,
10802 intern_c_string ("coding-category-iso-7"));
10803 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10804 intern_c_string ("coding-category-iso-7-tight"));
10805 ASET (Vcoding_category_table, coding_category_iso_8_1,
10806 intern_c_string ("coding-category-iso-8-1"));
10807 ASET (Vcoding_category_table, coding_category_iso_8_2,
10808 intern_c_string ("coding-category-iso-8-2"));
10809 ASET (Vcoding_category_table, coding_category_iso_7_else,
10810 intern_c_string ("coding-category-iso-7-else"));
10811 ASET (Vcoding_category_table, coding_category_iso_8_else,
10812 intern_c_string ("coding-category-iso-8-else"));
10813 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10814 intern_c_string ("coding-category-utf-8-auto"));
10815 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10816 intern_c_string ("coding-category-utf-8"));
10817 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10818 intern_c_string ("coding-category-utf-8-sig"));
10819 ASET (Vcoding_category_table, coding_category_utf_16_be,
10820 intern_c_string ("coding-category-utf-16-be"));
10821 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10822 intern_c_string ("coding-category-utf-16-auto"));
10823 ASET (Vcoding_category_table, coding_category_utf_16_le,
10824 intern_c_string ("coding-category-utf-16-le"));
10825 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10826 intern_c_string ("coding-category-utf-16-be-nosig"));
10827 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10828 intern_c_string ("coding-category-utf-16-le-nosig"));
10829 ASET (Vcoding_category_table, coding_category_charset,
10830 intern_c_string ("coding-category-charset"));
10831 ASET (Vcoding_category_table, coding_category_sjis,
10832 intern_c_string ("coding-category-sjis"));
10833 ASET (Vcoding_category_table, coding_category_big5,
10834 intern_c_string ("coding-category-big5"));
10835 ASET (Vcoding_category_table, coding_category_ccl,
10836 intern_c_string ("coding-category-ccl"));
10837 ASET (Vcoding_category_table, coding_category_emacs_mule,
10838 intern_c_string ("coding-category-emacs-mule"));
10839 /* Followings are NOT target of code detection. */
10840 ASET (Vcoding_category_table, coding_category_raw_text,
10841 intern_c_string ("coding-category-raw-text"));
10842 ASET (Vcoding_category_table, coding_category_undecided,
10843 intern_c_string ("coding-category-undecided"));
10844
10845 DEFSYM (Qinsufficient_source, "insufficient-source");
10846 DEFSYM (Qinvalid_source, "invalid-source");
10847 DEFSYM (Qinterrupted, "interrupted");
10848 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10849
10850 defsubr (&Scoding_system_p);
10851 defsubr (&Sread_coding_system);
10852 defsubr (&Sread_non_nil_coding_system);
10853 defsubr (&Scheck_coding_system);
10854 defsubr (&Sdetect_coding_region);
10855 defsubr (&Sdetect_coding_string);
10856 defsubr (&Sfind_coding_systems_region_internal);
10857 defsubr (&Sunencodable_char_position);
10858 defsubr (&Scheck_coding_systems_region);
10859 defsubr (&Sdecode_coding_region);
10860 defsubr (&Sencode_coding_region);
10861 defsubr (&Sdecode_coding_string);
10862 defsubr (&Sencode_coding_string);
10863 defsubr (&Sdecode_sjis_char);
10864 defsubr (&Sencode_sjis_char);
10865 defsubr (&Sdecode_big5_char);
10866 defsubr (&Sencode_big5_char);
10867 defsubr (&Sset_terminal_coding_system_internal);
10868 defsubr (&Sset_safe_terminal_coding_system_internal);
10869 defsubr (&Sterminal_coding_system);
10870 defsubr (&Sset_keyboard_coding_system_internal);
10871 defsubr (&Skeyboard_coding_system);
10872 defsubr (&Sfind_operation_coding_system);
10873 defsubr (&Sset_coding_system_priority);
10874 defsubr (&Sdefine_coding_system_internal);
10875 defsubr (&Sdefine_coding_system_alias);
10876 defsubr (&Scoding_system_put);
10877 defsubr (&Scoding_system_base);
10878 defsubr (&Scoding_system_plist);
10879 defsubr (&Scoding_system_aliases);
10880 defsubr (&Scoding_system_eol_type);
10881 defsubr (&Scoding_system_priority_list);
10882
10883 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10884 doc: /* List of coding systems.
10885
10886 Do not alter the value of this variable manually. This variable should be
10887 updated by the functions `define-coding-system' and
10888 `define-coding-system-alias'. */);
10889 Vcoding_system_list = Qnil;
10890
10891 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10892 doc: /* Alist of coding system names.
10893 Each element is one element list of coding system name.
10894 This variable is given to `completing-read' as COLLECTION argument.
10895
10896 Do not alter the value of this variable manually. This variable should be
10897 updated by the functions `make-coding-system' and
10898 `define-coding-system-alias'. */);
10899 Vcoding_system_alist = Qnil;
10900
10901 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10902 doc: /* List of coding-categories (symbols) ordered by priority.
10903
10904 On detecting a coding system, Emacs tries code detection algorithms
10905 associated with each coding-category one by one in this order. When
10906 one algorithm agrees with a byte sequence of source text, the coding
10907 system bound to the corresponding coding-category is selected.
10908
10909 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10910 {
10911 int i;
10912
10913 Vcoding_category_list = Qnil;
10914 for (i = coding_category_max - 1; i >= 0; i--)
10915 Vcoding_category_list
10916 = Fcons (AREF (Vcoding_category_table, i),
10917 Vcoding_category_list);
10918 }
10919
10920 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10921 doc: /* Specify the coding system for read operations.
10922 It is useful to bind this variable with `let', but do not set it globally.
10923 If the value is a coding system, it is used for decoding on read operation.
10924 If not, an appropriate element is used from one of the coding system alists.
10925 There are three such tables: `file-coding-system-alist',
10926 `process-coding-system-alist', and `network-coding-system-alist'. */);
10927 Vcoding_system_for_read = Qnil;
10928
10929 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10930 doc: /* Specify the coding system for write operations.
10931 Programs bind this variable with `let', but you should not set it globally.
10932 If the value is a coding system, it is used for encoding of output,
10933 when writing it to a file and when sending it to a file or subprocess.
10934
10935 If this does not specify a coding system, an appropriate element
10936 is used from one of the coding system alists.
10937 There are three such tables: `file-coding-system-alist',
10938 `process-coding-system-alist', and `network-coding-system-alist'.
10939 For output to files, if the above procedure does not specify a coding system,
10940 the value of `buffer-file-coding-system' is used. */);
10941 Vcoding_system_for_write = Qnil;
10942
10943 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10944 doc: /*
10945 Coding system used in the latest file or process I/O. */);
10946 Vlast_coding_system_used = Qnil;
10947
10948 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10949 doc: /*
10950 Error status of the last code conversion.
10951
10952 When an error was detected in the last code conversion, this variable
10953 is set to one of the following symbols.
10954 `insufficient-source'
10955 `inconsistent-eol'
10956 `invalid-source'
10957 `interrupted'
10958 `insufficient-memory'
10959 When no error was detected, the value doesn't change. So, to check
10960 the error status of a code conversion by this variable, you must
10961 explicitly set this variable to nil before performing code
10962 conversion. */);
10963 Vlast_code_conversion_error = Qnil;
10964
10965 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10966 doc: /*
10967 *Non-nil means always inhibit code conversion of end-of-line format.
10968 See info node `Coding Systems' and info node `Text and Binary' concerning
10969 such conversion. */);
10970 inhibit_eol_conversion = 0;
10971
10972 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10973 doc: /*
10974 Non-nil means process buffer inherits coding system of process output.
10975 Bind it to t if the process output is to be treated as if it were a file
10976 read from some filesystem. */);
10977 inherit_process_coding_system = 0;
10978
10979 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10980 doc: /*
10981 Alist to decide a coding system to use for a file I/O operation.
10982 The format is ((PATTERN . VAL) ...),
10983 where PATTERN is a regular expression matching a file name,
10984 VAL is a coding system, a cons of coding systems, or a function symbol.
10985 If VAL is a coding system, it is used for both decoding and encoding
10986 the file contents.
10987 If VAL is a cons of coding systems, the car part is used for decoding,
10988 and the cdr part is used for encoding.
10989 If VAL is a function symbol, the function must return a coding system
10990 or a cons of coding systems which are used as above. The function is
10991 called with an argument that is a list of the arguments with which
10992 `find-operation-coding-system' was called. If the function can't decide
10993 a coding system, it can return `undecided' so that the normal
10994 code-detection is performed.
10995
10996 See also the function `find-operation-coding-system'
10997 and the variable `auto-coding-alist'. */);
10998 Vfile_coding_system_alist = Qnil;
10999
11000 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11001 doc: /*
11002 Alist to decide a coding system to use for a process I/O operation.
11003 The format is ((PATTERN . VAL) ...),
11004 where PATTERN is a regular expression matching a program name,
11005 VAL is a coding system, a cons of coding systems, or a function symbol.
11006 If VAL is a coding system, it is used for both decoding what received
11007 from the program and encoding what sent to the program.
11008 If VAL is a cons of coding systems, the car part is used for decoding,
11009 and the cdr part is used for encoding.
11010 If VAL is a function symbol, the function must return a coding system
11011 or a cons of coding systems which are used as above.
11012
11013 See also the function `find-operation-coding-system'. */);
11014 Vprocess_coding_system_alist = Qnil;
11015
11016 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11017 doc: /*
11018 Alist to decide a coding system to use for a network I/O operation.
11019 The format is ((PATTERN . VAL) ...),
11020 where PATTERN is a regular expression matching a network service name
11021 or is a port number to connect to,
11022 VAL is a coding system, a cons of coding systems, or a function symbol.
11023 If VAL is a coding system, it is used for both decoding what received
11024 from the network stream and encoding what sent to the network stream.
11025 If VAL is a cons of coding systems, the car part is used for decoding,
11026 and the cdr part is used for encoding.
11027 If VAL is a function symbol, the function must return a coding system
11028 or a cons of coding systems which are used as above.
11029
11030 See also the function `find-operation-coding-system'. */);
11031 Vnetwork_coding_system_alist = Qnil;
11032
11033 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11034 doc: /* Coding system to use with system messages.
11035 Also used for decoding keyboard input on X Window system. */);
11036 Vlocale_coding_system = Qnil;
11037
11038 /* The eol mnemonics are reset in startup.el system-dependently. */
11039 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11040 doc: /*
11041 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
11042 eol_mnemonic_unix = build_pure_c_string (":");
11043
11044 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11045 doc: /*
11046 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
11047 eol_mnemonic_dos = build_pure_c_string ("\\");
11048
11049 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11050 doc: /*
11051 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
11052 eol_mnemonic_mac = build_pure_c_string ("/");
11053
11054 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11055 doc: /*
11056 *String displayed in mode line when end-of-line format is not yet determined. */);
11057 eol_mnemonic_undecided = build_pure_c_string (":");
11058
11059 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11060 doc: /*
11061 *Non-nil enables character translation while encoding and decoding. */);
11062 Venable_character_translation = Qt;
11063
11064 DEFVAR_LISP ("standard-translation-table-for-decode",
11065 Vstandard_translation_table_for_decode,
11066 doc: /* Table for translating characters while decoding. */);
11067 Vstandard_translation_table_for_decode = Qnil;
11068
11069 DEFVAR_LISP ("standard-translation-table-for-encode",
11070 Vstandard_translation_table_for_encode,
11071 doc: /* Table for translating characters while encoding. */);
11072 Vstandard_translation_table_for_encode = Qnil;
11073
11074 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11075 doc: /* Alist of charsets vs revision numbers.
11076 While encoding, if a charset (car part of an element) is found,
11077 designate it with the escape sequence identifying revision (cdr part
11078 of the element). */);
11079 Vcharset_revision_table = Qnil;
11080
11081 DEFVAR_LISP ("default-process-coding-system",
11082 Vdefault_process_coding_system,
11083 doc: /* Cons of coding systems used for process I/O by default.
11084 The car part is used for decoding a process output,
11085 the cdr part is used for encoding a text to be sent to a process. */);
11086 Vdefault_process_coding_system = Qnil;
11087
11088 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11089 doc: /*
11090 Table of extra Latin codes in the range 128..159 (inclusive).
11091 This is a vector of length 256.
11092 If Nth element is non-nil, the existence of code N in a file
11093 \(or output of subprocess) doesn't prevent it to be detected as
11094 a coding system of ISO 2022 variant which has a flag
11095 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11096 or reading output of a subprocess.
11097 Only 128th through 159th elements have a meaning. */);
11098 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11099
11100 DEFVAR_LISP ("select-safe-coding-system-function",
11101 Vselect_safe_coding_system_function,
11102 doc: /*
11103 Function to call to select safe coding system for encoding a text.
11104
11105 If set, this function is called to force a user to select a proper
11106 coding system which can encode the text in the case that a default
11107 coding system used in each operation can't encode the text. The
11108 function should take care that the buffer is not modified while
11109 the coding system is being selected.
11110
11111 The default value is `select-safe-coding-system' (which see). */);
11112 Vselect_safe_coding_system_function = Qnil;
11113
11114 DEFVAR_BOOL ("coding-system-require-warning",
11115 coding_system_require_warning,
11116 doc: /* Internal use only.
11117 If non-nil, on writing a file, `select-safe-coding-system-function' is
11118 called even if `coding-system-for-write' is non-nil. The command
11119 `universal-coding-system-argument' binds this variable to t temporarily. */);
11120 coding_system_require_warning = 0;
11121
11122
11123 DEFVAR_BOOL ("inhibit-iso-escape-detection",
11124 inhibit_iso_escape_detection,
11125 doc: /*
11126 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11127
11128 When Emacs reads text, it tries to detect how the text is encoded.
11129 This code detection is sensitive to escape sequences. If Emacs sees
11130 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11131 of the ISO2022 encodings, and decodes text by the corresponding coding
11132 system (e.g. `iso-2022-7bit').
11133
11134 However, there may be a case that you want to read escape sequences in
11135 a file as is. In such a case, you can set this variable to non-nil.
11136 Then the code detection will ignore any escape sequences, and no text is
11137 detected as encoded in some ISO-2022 encoding. The result is that all
11138 escape sequences become visible in a buffer.
11139
11140 The default value is nil, and it is strongly recommended not to change
11141 it. That is because many Emacs Lisp source files that contain
11142 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11143 in Emacs's distribution, and they won't be decoded correctly on
11144 reading if you suppress escape sequence detection.
11145
11146 The other way to read escape sequences in a file without decoding is
11147 to explicitly specify some coding system that doesn't use ISO-2022
11148 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
11149 inhibit_iso_escape_detection = 0;
11150
11151 DEFVAR_BOOL ("inhibit-null-byte-detection",
11152 inhibit_null_byte_detection,
11153 doc: /* If non-nil, Emacs ignores null bytes on code detection.
11154 By default, Emacs treats it as binary data, and does not attempt to
11155 decode it. The effect is as if you specified `no-conversion' for
11156 reading that text.
11157
11158 Set this to non-nil when a regular text happens to include null bytes.
11159 Examples are Index nodes of Info files and null-byte delimited output
11160 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
11161 decode text as usual. */);
11162 inhibit_null_byte_detection = 0;
11163
11164 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11165 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11166 Internal use only. Removed after the experimental optimizer gets stable. */);
11167 disable_ascii_optimization = 0;
11168
11169 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11170 doc: /* Char table for translating self-inserting characters.
11171 This is applied to the result of input methods, not their input.
11172 See also `keyboard-translate-table'.
11173
11174 Use of this variable for character code unification was rendered
11175 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11176 internal character representation. */);
11177 Vtranslation_table_for_input = Qnil;
11178
11179 {
11180 Lisp_Object args[coding_arg_undecided_max];
11181 Lisp_Object plist[16];
11182 int i;
11183
11184 for (i = 0; i < coding_arg_undecided_max; i++)
11185 args[i] = Qnil;
11186
11187 plist[0] = intern_c_string (":name");
11188 plist[1] = args[coding_arg_name] = Qno_conversion;
11189 plist[2] = intern_c_string (":mnemonic");
11190 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11191 plist[4] = intern_c_string (":coding-type");
11192 plist[5] = args[coding_arg_coding_type] = Qraw_text;
11193 plist[6] = intern_c_string (":ascii-compatible-p");
11194 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11195 plist[8] = intern_c_string (":default-char");
11196 plist[9] = args[coding_arg_default_char] = make_number (0);
11197 plist[10] = intern_c_string (":for-unibyte");
11198 plist[11] = args[coding_arg_for_unibyte] = Qt;
11199 plist[12] = intern_c_string (":docstring");
11200 plist[13] = build_pure_c_string ("Do no conversion.\n\
11201 \n\
11202 When you visit a file with this coding, the file is read into a\n\
11203 unibyte buffer as is, thus each byte of a file is treated as a\n\
11204 character.");
11205 plist[14] = intern_c_string (":eol-type");
11206 plist[15] = args[coding_arg_eol_type] = Qunix;
11207 args[coding_arg_plist] = Flist (16, plist);
11208 Fdefine_coding_system_internal (coding_arg_max, args);
11209
11210 plist[1] = args[coding_arg_name] = Qundecided;
11211 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11212 plist[5] = args[coding_arg_coding_type] = Qundecided;
11213 /* This is already set.
11214 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11215 plist[8] = intern_c_string (":charset-list");
11216 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11217 plist[11] = args[coding_arg_for_unibyte] = Qnil;
11218 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11219 plist[15] = args[coding_arg_eol_type] = Qnil;
11220 args[coding_arg_plist] = Flist (16, plist);
11221 Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11222 }
11223
11224 setup_coding_system (Qno_conversion, &safe_terminal_coding);
11225
11226 {
11227 int i;
11228
11229 for (i = 0; i < coding_category_max; i++)
11230 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11231 }
11232 #if defined (DOS_NT)
11233 system_eol_type = Qdos;
11234 #else
11235 system_eol_type = Qunix;
11236 #endif
11237 staticpro (&system_eol_type);
11238 }
11239
11240 char *
11241 emacs_strerror (int error_number)
11242 {
11243 char *str;
11244
11245 synchronize_system_messages_locale ();
11246 str = strerror (error_number);
11247
11248 if (! NILP (Vlocale_coding_system))
11249 {
11250 Lisp_Object dec = code_convert_string_norecord (build_string (str),
11251 Vlocale_coding_system,
11252 0);
11253 str = SSDATA (dec);
11254 }
11255
11256 return str;
11257 }
11258
11259 #endif /* emacs */