]> code.delx.au - gnu-emacs/blob - src/coding.c
Merge from emacs--rel--22
[gnu-emacs] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
11
12 This file is part of GNU Emacs.
13
14 GNU Emacs is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 3, or (at your option)
17 any later version.
18
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs; see the file COPYING. If not, write to
26 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 Boston, MA 02110-1301, USA. */
28
29 /*** TABLE OF CONTENTS ***
30
31 0. General comments
32 1. Preamble
33 2. Emacs' internal format (emacs-utf-8) handlers
34 3. UTF-8 handlers
35 4. UTF-16 handlers
36 5. Charset-base coding systems handlers
37 6. emacs-mule (old Emacs' internal format) handlers
38 7. ISO2022 handlers
39 8. Shift-JIS and BIG5 handlers
40 9. CCL handlers
41 10. C library functions
42 11. Emacs Lisp library functions
43 12. Postamble
44
45 */
46
47 /*** 0. General comments ***
48
49
50 CODING SYSTEM
51
52 A coding system is an object for an encoding mechanism that contains
53 information about how to convert byte sequences to character
54 sequences and vice versa. When we say "decode", it means converting
55 a byte sequence of a specific coding system into a character
56 sequence that is represented by Emacs' internal coding system
57 `emacs-utf-8', and when we say "encode", it means converting a
58 character sequence of emacs-utf-8 to a byte sequence of a specific
59 coding system.
60
61 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
62 C level, a coding system is represented by a vector of attributes
63 stored in the hash table Vcharset_hash_table. The conversion from
64 coding system symbol to attributes vector is done by looking up
65 Vcharset_hash_table by the symbol.
66
67 Coding systems are classified into the following types depending on
68 the encoding mechanism. Here's a brief description of the types.
69
70 o UTF-8
71
72 o UTF-16
73
74 o Charset-base coding system
75
76 A coding system defined by one or more (coded) character sets.
77 Decoding and encoding are done by a code converter defined for each
78 character set.
79
80 o Old Emacs internal format (emacs-mule)
81
82 The coding system adopted by old versions of Emacs (20 and 21).
83
84 o ISO2022-base coding system
85
86 The most famous coding system for multiple character sets. X's
87 Compound Text, various EUCs (Extended Unix Code), and coding systems
88 used in the Internet communication such as ISO-2022-JP are all
89 variants of ISO2022.
90
91 o SJIS (or Shift-JIS or MS-Kanji-Code)
92
93 A coding system to encode character sets: ASCII, JISX0201, and
94 JISX0208. Widely used for PC's in Japan. Details are described in
95 section 8.
96
97 o BIG5
98
99 A coding system to encode character sets: ASCII and Big5. Widely
100 used for Chinese (mainly in Taiwan and Hong Kong). Details are
101 described in section 8. In this file, when we write "big5" (all
102 lowercase), we mean the coding system, and when we write "Big5"
103 (capitalized), we mean the character set.
104
105 o CCL
106
107 If a user wants to decode/encode text encoded in a coding system
108 not listed above, he can supply a decoder and an encoder for it in
109 CCL (Code Conversion Language) programs. Emacs executes the CCL
110 program while decoding/encoding.
111
112 o Raw-text
113
114 A coding system for text containing raw eight-bit data. Emacs
115 treats each byte of source text as a character (except for
116 end-of-line conversion).
117
118 o No-conversion
119
120 Like raw text, but don't do end-of-line conversion.
121
122
123 END-OF-LINE FORMAT
124
125 How text end-of-line is encoded depends on operating system. For
126 instance, Unix's format is just one byte of LF (line-feed) code,
127 whereas DOS's format is two-byte sequence of `carriage-return' and
128 `line-feed' codes. MacOS's format is usually one byte of
129 `carriage-return'.
130
131 Since text character encoding and end-of-line encoding are
132 independent, any coding system described above can take any format
133 of end-of-line (except for no-conversion).
134
135 STRUCT CODING_SYSTEM
136
137 Before using a coding system for code conversion (i.e. decoding and
138 encoding), we setup a structure of type `struct coding_system'.
139 This structure keeps various information about a specific code
140 conversion (e.g. the location of source and destination data).
141
142 */
143
144 /* COMMON MACROS */
145
146
147 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
148
149 These functions check if a byte sequence specified as a source in
150 CODING conforms to the format of XXX, and update the members of
151 DETECT_INFO.
152
153 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
154
155 Below is the template of these functions. */
156
157 #if 0
158 static int
159 detect_coding_XXX (coding, detect_info)
160 struct coding_system *coding;
161 struct coding_detection_info *detect_info;
162 {
163 const unsigned char *src = coding->source;
164 const unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int consumed_chars = 0;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175
176 if (! __C_conforms_to_XXX___ (c))
177 break;
178 if (! __C_strongly_suggests_XXX__ (c))
179 found = CATEGORY_MASK_XXX;
180 }
181 /* The byte sequence is invalid for XXX. */
182 detect_info->rejected |= CATEGORY_MASK_XXX;
183 return 0;
184
185 no_more_source:
186 /* The source exausted successfully. */
187 detect_info->found |= found;
188 return 1;
189 }
190 #endif
191
192 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193
194 These functions decode a byte sequence specified as a source by
195 CODING. The resulting multibyte text goes to a place pointed to by
196 CODING->charbuf, the length of which should not exceed
197 CODING->charbuf_size;
198
199 These functions set the information of original and decoded texts in
200 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
201 They also set CODING->result to one of CODING_RESULT_XXX indicating
202 how the decoding is finished.
203
204 Below is the template of these functions. */
205
206 #if 0
207 static void
208 decode_coding_XXXX (coding)
209 struct coding_system *coding;
210 {
211 const unsigned char *src = coding->source + coding->consumed;
212 const unsigned char *src_end = coding->source + coding->src_bytes;
213 /* SRC_BASE remembers the start position in source in each loop.
214 The loop will be exited when there's not enough source code, or
215 when there's no room in CHARBUF for a decoded character. */
216 const unsigned char *src_base;
217 /* A buffer to produce decoded characters. */
218 int *charbuf = coding->charbuf + coding->charbuf_used;
219 int *charbuf_end = coding->charbuf + coding->charbuf_size;
220 int multibytep = coding->src_multibyte;
221
222 while (1)
223 {
224 src_base = src;
225 if (charbuf < charbuf_end)
226 /* No more room to produce a decoded character. */
227 break;
228 ONE_MORE_BYTE (c);
229 /* Decode it. */
230 }
231
232 no_more_source:
233 if (src_base < src_end
234 && coding->mode & CODING_MODE_LAST_BLOCK)
235 /* If the source ends by partial bytes to construct a character,
236 treat them as eight-bit raw data. */
237 while (src_base < src_end && charbuf < charbuf_end)
238 *charbuf++ = *src_base++;
239 /* Remember how many bytes and characters we consumed. If the
240 source is multibyte, the bytes and chars are not identical. */
241 coding->consumed = coding->consumed_char = src_base - coding->source;
242 /* Remember how many characters we produced. */
243 coding->charbuf_used = charbuf - coding->charbuf;
244 }
245 #endif
246
247 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248
249 These functions encode SRC_BYTES length text at SOURCE of Emacs'
250 internal multibyte format by CODING. The resulting byte sequence
251 goes to a place pointed to by DESTINATION, the length of which
252 should not exceed DST_BYTES.
253
254 These functions set the information of original and encoded texts in
255 the members produced, produced_char, consumed, and consumed_char of
256 the structure *CODING. They also set the member result to one of
257 CODING_RESULT_XXX indicating how the encoding finished.
258
259 DST_BYTES zero means that source area and destination area are
260 overlapped, which means that we can produce a encoded text until it
261 reaches at the head of not-yet-encoded source text.
262
263 Below is a template of these functions. */
264 #if 0
265 static void
266 encode_coding_XXX (coding)
267 struct coding_system *coding;
268 {
269 int multibytep = coding->dst_multibyte;
270 int *charbuf = coding->charbuf;
271 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
272 unsigned char *dst = coding->destination + coding->produced;
273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
274 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
275 int produced_chars = 0;
276
277 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
278 {
279 int c = *charbuf;
280 /* Encode C into DST, and increment DST. */
281 }
282 label_no_more_destination:
283 /* How many chars and bytes we produced. */
284 coding->produced_char += produced_chars;
285 coding->produced = dst - coding->destination;
286 }
287 #endif
288
289 \f
290 /*** 1. Preamble ***/
291
292 #include <config.h>
293 #include <stdio.h>
294
295 #include "lisp.h"
296 #include "buffer.h"
297 #include "character.h"
298 #include "charset.h"
299 #include "ccl.h"
300 #include "composite.h"
301 #include "coding.h"
302 #include "window.h"
303 #include "frame.h"
304 #include "termhooks.h"
305
306 Lisp_Object Vcoding_system_hash_table;
307
308 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
309 Lisp_Object Qunix, Qdos;
310 extern Lisp_Object Qmac; /* frame.c */
311 Lisp_Object Qbuffer_file_coding_system;
312 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
313 Lisp_Object Qdefault_char;
314 Lisp_Object Qno_conversion, Qundecided;
315 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
316 Lisp_Object Qbig, Qlittle;
317 Lisp_Object Qcoding_system_history;
318 Lisp_Object Qvalid_codes;
319 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
320 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
321 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
322 Lisp_Object QCascii_compatible_p;
323
324 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 Lisp_Object Qtarget_idx;
328
329 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
330 Lisp_Object Qinterrupted, Qinsufficient_memory;
331
332 extern Lisp_Object Qcompletion_ignore_case;
333
334 /* If a symbol has this property, evaluate the value to define the
335 symbol as a coding system. */
336 static Lisp_Object Qcoding_system_define_form;
337
338 int coding_system_require_warning;
339
340 Lisp_Object Vselect_safe_coding_system_function;
341
342 /* Mnemonic string for each format of end-of-line. */
343 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
344 /* Mnemonic string to indicate format of end-of-line is not yet
345 decided. */
346 Lisp_Object eol_mnemonic_undecided;
347
348 /* Format of end-of-line decided by system. This is Qunix on
349 Unix and Mac, Qdos on DOS/Windows.
350 This has an effect only for external encoding (i.e. for output to
351 file and process), not for in-buffer or Lisp string encoding. */
352 static Lisp_Object system_eol_type;
353
354 #ifdef emacs
355
356 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
357
358 Lisp_Object Qcoding_system_p, Qcoding_system_error;
359
360 /* Coding system emacs-mule and raw-text are for converting only
361 end-of-line format. */
362 Lisp_Object Qemacs_mule, Qraw_text;
363 Lisp_Object Qutf_8_emacs;
364
365 /* Coding-systems are handed between Emacs Lisp programs and C internal
366 routines by the following three variables. */
367 /* Coding-system for reading files and receiving data from process. */
368 Lisp_Object Vcoding_system_for_read;
369 /* Coding-system for writing files and sending data to process. */
370 Lisp_Object Vcoding_system_for_write;
371 /* Coding-system actually used in the latest I/O. */
372 Lisp_Object Vlast_coding_system_used;
373 /* Set to non-nil when an error is detected while code conversion. */
374 Lisp_Object Vlast_code_conversion_error;
375 /* A vector of length 256 which contains information about special
376 Latin codes (especially for dealing with Microsoft codes). */
377 Lisp_Object Vlatin_extra_code_table;
378
379 /* Flag to inhibit code conversion of end-of-line format. */
380 int inhibit_eol_conversion;
381
382 /* Flag to inhibit ISO2022 escape sequence detection. */
383 int inhibit_iso_escape_detection;
384
385 /* Flag to make buffer-file-coding-system inherit from process-coding. */
386 int inherit_process_coding_system;
387
388 /* Coding system to be used to encode text for terminal display when
389 terminal coding system is nil. */
390 struct coding_system safe_terminal_coding;
391
392 Lisp_Object Vfile_coding_system_alist;
393 Lisp_Object Vprocess_coding_system_alist;
394 Lisp_Object Vnetwork_coding_system_alist;
395
396 Lisp_Object Vlocale_coding_system;
397
398 #endif /* emacs */
399
400 /* Flag to tell if we look up translation table on character code
401 conversion. */
402 Lisp_Object Venable_character_translation;
403 /* Standard translation table to look up on decoding (reading). */
404 Lisp_Object Vstandard_translation_table_for_decode;
405 /* Standard translation table to look up on encoding (writing). */
406 Lisp_Object Vstandard_translation_table_for_encode;
407
408 Lisp_Object Qtranslation_table;
409 Lisp_Object Qtranslation_table_id;
410 Lisp_Object Qtranslation_table_for_decode;
411 Lisp_Object Qtranslation_table_for_encode;
412
413 /* Alist of charsets vs revision number. */
414 static Lisp_Object Vcharset_revision_table;
415
416 /* Default coding systems used for process I/O. */
417 Lisp_Object Vdefault_process_coding_system;
418
419 /* Char table for translating Quail and self-inserting input. */
420 Lisp_Object Vtranslation_table_for_input;
421
422 /* Two special coding systems. */
423 Lisp_Object Vsjis_coding_system;
424 Lisp_Object Vbig5_coding_system;
425
426 /* ISO2022 section */
427
428 #define CODING_ISO_INITIAL(coding, reg) \
429 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
430 coding_attr_iso_initial), \
431 reg)))
432
433
434 #define CODING_ISO_REQUEST(coding, charset_id) \
435 ((charset_id <= (coding)->max_charset_id \
436 ? (coding)->safe_charsets[charset_id] \
437 : -1))
438
439
440 #define CODING_ISO_FLAGS(coding) \
441 ((coding)->spec.iso_2022.flags)
442 #define CODING_ISO_DESIGNATION(coding, reg) \
443 ((coding)->spec.iso_2022.current_designation[reg])
444 #define CODING_ISO_INVOCATION(coding, plane) \
445 ((coding)->spec.iso_2022.current_invocation[plane])
446 #define CODING_ISO_SINGLE_SHIFTING(coding) \
447 ((coding)->spec.iso_2022.single_shifting)
448 #define CODING_ISO_BOL(coding) \
449 ((coding)->spec.iso_2022.bol)
450 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
451 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
452
453 /* Control characters of ISO2022. */
454 /* code */ /* function */
455 #define ISO_CODE_LF 0x0A /* line-feed */
456 #define ISO_CODE_CR 0x0D /* carriage-return */
457 #define ISO_CODE_SO 0x0E /* shift-out */
458 #define ISO_CODE_SI 0x0F /* shift-in */
459 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
460 #define ISO_CODE_ESC 0x1B /* escape */
461 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
462 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
463 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
464
465 /* All code (1-byte) of ISO2022 is classified into one of the
466 followings. */
467 enum iso_code_class_type
468 {
469 ISO_control_0, /* Control codes in the range
470 0x00..0x1F and 0x7F, except for the
471 following 5 codes. */
472 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
473 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
474 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
475 ISO_escape, /* ISO_CODE_SO (0x1B) */
476 ISO_control_1, /* Control codes in the range
477 0x80..0x9F, except for the
478 following 3 codes. */
479 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
480 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
481 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
482 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
483 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
484 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
485 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
486 };
487
488 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
489 `iso-flags' attribute of an iso2022 coding system. */
490
491 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
492 instead of the correct short-form sequence (e.g. ESC $ A). */
493 #define CODING_ISO_FLAG_LONG_FORM 0x0001
494
495 /* If set, reset graphic planes and registers at end-of-line to the
496 initial state. */
497 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
498
499 /* If set, reset graphic planes and registers before any control
500 characters to the initial state. */
501 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
502
503 /* If set, encode by 7-bit environment. */
504 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
505
506 /* If set, use locking-shift function. */
507 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
508
509 /* If set, use single-shift function. Overwrite
510 CODING_ISO_FLAG_LOCKING_SHIFT. */
511 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
512
513 /* If set, use designation escape sequence. */
514 #define CODING_ISO_FLAG_DESIGNATION 0x0040
515
516 /* If set, produce revision number sequence. */
517 #define CODING_ISO_FLAG_REVISION 0x0080
518
519 /* If set, produce ISO6429's direction specifying sequence. */
520 #define CODING_ISO_FLAG_DIRECTION 0x0100
521
522 /* If set, assume designation states are reset at beginning of line on
523 output. */
524 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
525
526 /* If set, designation sequence should be placed at beginning of line
527 on output. */
528 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
529
530 /* If set, do not encode unsafe charactes on output. */
531 #define CODING_ISO_FLAG_SAFE 0x0800
532
533 /* If set, extra latin codes (128..159) are accepted as a valid code
534 on input. */
535 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
536
537 #define CODING_ISO_FLAG_COMPOSITION 0x2000
538
539 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
540
541 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
542
543 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
544
545 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
546
547 /* A character to be produced on output if encoding of the original
548 character is prohibited by CODING_ISO_FLAG_SAFE. */
549 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
550
551
552 /* UTF-16 section */
553 #define CODING_UTF_16_BOM(coding) \
554 ((coding)->spec.utf_16.bom)
555
556 #define CODING_UTF_16_ENDIAN(coding) \
557 ((coding)->spec.utf_16.endian)
558
559 #define CODING_UTF_16_SURROGATE(coding) \
560 ((coding)->spec.utf_16.surrogate)
561
562
563 /* CCL section */
564 #define CODING_CCL_DECODER(coding) \
565 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
566 #define CODING_CCL_ENCODER(coding) \
567 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
568 #define CODING_CCL_VALIDS(coding) \
569 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
570
571 /* Index for each coding category in `coding_categories' */
572
573 enum coding_category
574 {
575 coding_category_iso_7,
576 coding_category_iso_7_tight,
577 coding_category_iso_8_1,
578 coding_category_iso_8_2,
579 coding_category_iso_7_else,
580 coding_category_iso_8_else,
581 coding_category_utf_8,
582 coding_category_utf_16_auto,
583 coding_category_utf_16_be,
584 coding_category_utf_16_le,
585 coding_category_utf_16_be_nosig,
586 coding_category_utf_16_le_nosig,
587 coding_category_charset,
588 coding_category_sjis,
589 coding_category_big5,
590 coding_category_ccl,
591 coding_category_emacs_mule,
592 /* All above are targets of code detection. */
593 coding_category_raw_text,
594 coding_category_undecided,
595 coding_category_max
596 };
597
598 /* Definitions of flag bits used in detect_coding_XXXX. */
599 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
600 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
601 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
602 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
603 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
604 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
605 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
606 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
607 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
608 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
609 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
610 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
611 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
612 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
613 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
614 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
615 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
616 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
617
618 /* This value is returned if detect_coding_mask () find nothing other
619 than ASCII characters. */
620 #define CATEGORY_MASK_ANY \
621 (CATEGORY_MASK_ISO_7 \
622 | CATEGORY_MASK_ISO_7_TIGHT \
623 | CATEGORY_MASK_ISO_8_1 \
624 | CATEGORY_MASK_ISO_8_2 \
625 | CATEGORY_MASK_ISO_7_ELSE \
626 | CATEGORY_MASK_ISO_8_ELSE \
627 | CATEGORY_MASK_UTF_8 \
628 | CATEGORY_MASK_UTF_16_AUTO \
629 | CATEGORY_MASK_UTF_16_BE \
630 | CATEGORY_MASK_UTF_16_LE \
631 | CATEGORY_MASK_UTF_16_BE_NOSIG \
632 | CATEGORY_MASK_UTF_16_LE_NOSIG \
633 | CATEGORY_MASK_CHARSET \
634 | CATEGORY_MASK_SJIS \
635 | CATEGORY_MASK_BIG5 \
636 | CATEGORY_MASK_CCL \
637 | CATEGORY_MASK_EMACS_MULE)
638
639
640 #define CATEGORY_MASK_ISO_7BIT \
641 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
642
643 #define CATEGORY_MASK_ISO_8BIT \
644 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
645
646 #define CATEGORY_MASK_ISO_ELSE \
647 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
648
649 #define CATEGORY_MASK_ISO_ESCAPE \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_7_ELSE \
653 | CATEGORY_MASK_ISO_8_ELSE)
654
655 #define CATEGORY_MASK_ISO \
656 ( CATEGORY_MASK_ISO_7BIT \
657 | CATEGORY_MASK_ISO_8BIT \
658 | CATEGORY_MASK_ISO_ELSE)
659
660 #define CATEGORY_MASK_UTF_16 \
661 (CATEGORY_MASK_UTF_16_AUTO \
662 | CATEGORY_MASK_UTF_16_BE \
663 | CATEGORY_MASK_UTF_16_LE \
664 | CATEGORY_MASK_UTF_16_BE_NOSIG \
665 | CATEGORY_MASK_UTF_16_LE_NOSIG)
666
667
668 /* List of symbols `coding-category-xxx' ordered by priority. This
669 variable is exposed to Emacs Lisp. */
670 static Lisp_Object Vcoding_category_list;
671
672 /* Table of coding categories (Lisp symbols). This variable is for
673 internal use oly. */
674 static Lisp_Object Vcoding_category_table;
675
676 /* Table of coding-categories ordered by priority. */
677 static enum coding_category coding_priorities[coding_category_max];
678
679 /* Nth element is a coding context for the coding system bound to the
680 Nth coding category. */
681 static struct coding_system coding_categories[coding_category_max];
682
683 /*** Commonly used macros and functions ***/
684
685 #ifndef min
686 #define min(a, b) ((a) < (b) ? (a) : (b))
687 #endif
688 #ifndef max
689 #define max(a, b) ((a) > (b) ? (a) : (b))
690 #endif
691
692 #define CODING_GET_INFO(coding, attrs, charset_list) \
693 do { \
694 (attrs) = CODING_ID_ATTRS ((coding)->id); \
695 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
696 } while (0)
697
698
699 /* Safely get one byte from the source text pointed by SRC which ends
700 at SRC_END, and set C to that byte. If there are not enough bytes
701 in the source, it jumps to `no_more_source'. If multibytep is
702 nonzero, and a multibyte character is found at SRC, set C to the
703 negative value of the character code. The caller should declare
704 and set these variables appropriately in advance:
705 src, src_end, multibytep */
706
707 #define ONE_MORE_BYTE(c) \
708 do { \
709 if (src == src_end) \
710 { \
711 if (src_base < src) \
712 record_conversion_result \
713 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
714 goto no_more_source; \
715 } \
716 c = *src++; \
717 if (multibytep && (c & 0x80)) \
718 { \
719 if ((c & 0xFE) == 0xC0) \
720 c = ((c & 1) << 6) | *src++; \
721 else \
722 { \
723 src--; \
724 c = - string_char (src, &src, NULL); \
725 record_conversion_result \
726 (coding, CODING_RESULT_INVALID_SRC); \
727 } \
728 } \
729 consumed_chars++; \
730 } while (0)
731
732
733 #define ONE_MORE_BYTE_NO_CHECK(c) \
734 do { \
735 c = *src++; \
736 if (multibytep && (c & 0x80)) \
737 { \
738 if ((c & 0xFE) == 0xC0) \
739 c = ((c & 1) << 6) | *src++; \
740 else \
741 { \
742 src--; \
743 c = - string_char (src, &src, NULL); \
744 record_conversion_result \
745 (coding, CODING_RESULT_INVALID_SRC); \
746 } \
747 } \
748 consumed_chars++; \
749 } while (0)
750
751
752 /* Store a byte C in the place pointed by DST and increment DST to the
753 next free point, and increment PRODUCED_CHARS. The caller should
754 assure that C is 0..127, and declare and set the variable `dst'
755 appropriately in advance.
756 */
757
758
759 #define EMIT_ONE_ASCII_BYTE(c) \
760 do { \
761 produced_chars++; \
762 *dst++ = (c); \
763 } while (0)
764
765
766 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
767
768 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
769 do { \
770 produced_chars += 2; \
771 *dst++ = (c1), *dst++ = (c2); \
772 } while (0)
773
774
775 /* Store a byte C in the place pointed by DST and increment DST to the
776 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
777 nonzero, store in an appropriate multibyte from. The caller should
778 declare and set the variables `dst' and `multibytep' appropriately
779 in advance. */
780
781 #define EMIT_ONE_BYTE(c) \
782 do { \
783 produced_chars++; \
784 if (multibytep) \
785 { \
786 int ch = (c); \
787 if (ch >= 0x80) \
788 ch = BYTE8_TO_CHAR (ch); \
789 CHAR_STRING_ADVANCE (ch, dst); \
790 } \
791 else \
792 *dst++ = (c); \
793 } while (0)
794
795
796 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
797
798 #define EMIT_TWO_BYTES(c1, c2) \
799 do { \
800 produced_chars += 2; \
801 if (multibytep) \
802 { \
803 int ch; \
804 \
805 ch = (c1); \
806 if (ch >= 0x80) \
807 ch = BYTE8_TO_CHAR (ch); \
808 CHAR_STRING_ADVANCE (ch, dst); \
809 ch = (c2); \
810 if (ch >= 0x80) \
811 ch = BYTE8_TO_CHAR (ch); \
812 CHAR_STRING_ADVANCE (ch, dst); \
813 } \
814 else \
815 { \
816 *dst++ = (c1); \
817 *dst++ = (c2); \
818 } \
819 } while (0)
820
821
822 #define EMIT_THREE_BYTES(c1, c2, c3) \
823 do { \
824 EMIT_ONE_BYTE (c1); \
825 EMIT_TWO_BYTES (c2, c3); \
826 } while (0)
827
828
829 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
830 do { \
831 EMIT_TWO_BYTES (c1, c2); \
832 EMIT_TWO_BYTES (c3, c4); \
833 } while (0)
834
835
836 /* Prototypes for static functions. */
837 static void record_conversion_result P_ ((struct coding_system *coding,
838 enum coding_result_code result));
839 static int detect_coding_utf_8 P_ ((struct coding_system *,
840 struct coding_detection_info *info));
841 static void decode_coding_utf_8 P_ ((struct coding_system *));
842 static int encode_coding_utf_8 P_ ((struct coding_system *));
843
844 static int detect_coding_utf_16 P_ ((struct coding_system *,
845 struct coding_detection_info *info));
846 static void decode_coding_utf_16 P_ ((struct coding_system *));
847 static int encode_coding_utf_16 P_ ((struct coding_system *));
848
849 static int detect_coding_iso_2022 P_ ((struct coding_system *,
850 struct coding_detection_info *info));
851 static void decode_coding_iso_2022 P_ ((struct coding_system *));
852 static int encode_coding_iso_2022 P_ ((struct coding_system *));
853
854 static int detect_coding_emacs_mule P_ ((struct coding_system *,
855 struct coding_detection_info *info));
856 static void decode_coding_emacs_mule P_ ((struct coding_system *));
857 static int encode_coding_emacs_mule P_ ((struct coding_system *));
858
859 static int detect_coding_sjis P_ ((struct coding_system *,
860 struct coding_detection_info *info));
861 static void decode_coding_sjis P_ ((struct coding_system *));
862 static int encode_coding_sjis P_ ((struct coding_system *));
863
864 static int detect_coding_big5 P_ ((struct coding_system *,
865 struct coding_detection_info *info));
866 static void decode_coding_big5 P_ ((struct coding_system *));
867 static int encode_coding_big5 P_ ((struct coding_system *));
868
869 static int detect_coding_ccl P_ ((struct coding_system *,
870 struct coding_detection_info *info));
871 static void decode_coding_ccl P_ ((struct coding_system *));
872 static int encode_coding_ccl P_ ((struct coding_system *));
873
874 static void decode_coding_raw_text P_ ((struct coding_system *));
875 static int encode_coding_raw_text P_ ((struct coding_system *));
876
877 static void coding_set_source P_ ((struct coding_system *));
878 static void coding_set_destination P_ ((struct coding_system *));
879 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
880 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
881 EMACS_INT, EMACS_INT));
882 static unsigned char *alloc_destination P_ ((struct coding_system *,
883 EMACS_INT, unsigned char *));
884 static void setup_iso_safe_charsets P_ ((Lisp_Object));
885 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
886 int *, int *,
887 unsigned char *));
888 static int detect_eol P_ ((const unsigned char *,
889 EMACS_INT, enum coding_category));
890 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
891 static void decode_eol P_ ((struct coding_system *));
892 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
893 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
894 int, int *, int *));
895 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
896 static INLINE void produce_composition P_ ((struct coding_system *, int *,
897 EMACS_INT));
898 static INLINE void produce_charset P_ ((struct coding_system *, int *,
899 EMACS_INT));
900 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
901 static int decode_coding P_ ((struct coding_system *));
902 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
903 struct coding_system *,
904 int *, EMACS_INT *));
905 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
906 struct coding_system *,
907 int *, EMACS_INT *));
908 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
909 static int encode_coding P_ ((struct coding_system *));
910 static Lisp_Object make_conversion_work_buffer P_ ((int));
911 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
912 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
913 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
914
915 static void
916 record_conversion_result (struct coding_system *coding,
917 enum coding_result_code result)
918 {
919 coding->result = result;
920 switch (result)
921 {
922 case CODING_RESULT_INSUFFICIENT_SRC:
923 Vlast_code_conversion_error = Qinsufficient_source;
924 break;
925 case CODING_RESULT_INCONSISTENT_EOL:
926 Vlast_code_conversion_error = Qinconsistent_eol;
927 break;
928 case CODING_RESULT_INVALID_SRC:
929 Vlast_code_conversion_error = Qinvalid_source;
930 break;
931 case CODING_RESULT_INTERRUPT:
932 Vlast_code_conversion_error = Qinterrupted;
933 break;
934 case CODING_RESULT_INSUFFICIENT_MEM:
935 Vlast_code_conversion_error = Qinsufficient_memory;
936 break;
937 default:
938 Vlast_code_conversion_error = intern ("Unknown error");
939 }
940 }
941
942 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
943 do { \
944 charset_map_loaded = 0; \
945 c = DECODE_CHAR (charset, code); \
946 if (charset_map_loaded) \
947 { \
948 const unsigned char *orig = coding->source; \
949 EMACS_INT offset; \
950 \
951 coding_set_source (coding); \
952 offset = coding->source - orig; \
953 src += offset; \
954 src_base += offset; \
955 src_end += offset; \
956 } \
957 } while (0)
958
959
960 /* If there are at least BYTES length of room at dst, allocate memory
961 for coding->destination and update dst and dst_end. We don't have
962 to take care of coding->source which will be relocated. It is
963 handled by calling coding_set_source in encode_coding. */
964
965 #define ASSURE_DESTINATION(bytes) \
966 do { \
967 if (dst + (bytes) >= dst_end) \
968 { \
969 int more_bytes = charbuf_end - charbuf + (bytes); \
970 \
971 dst = alloc_destination (coding, more_bytes, dst); \
972 dst_end = coding->destination + coding->dst_bytes; \
973 } \
974 } while (0)
975
976
977 /* Store multibyte form of the character C in P, and advance P to the
978 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
979 never calls MAYBE_UNIFY_CHAR. */
980
981 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
982 do { \
983 if ((c) <= MAX_1_BYTE_CHAR) \
984 *(p)++ = (c); \
985 else if ((c) <= MAX_2_BYTE_CHAR) \
986 *(p)++ = (0xC0 | ((c) >> 6)), \
987 *(p)++ = (0x80 | ((c) & 0x3F)); \
988 else if ((c) <= MAX_3_BYTE_CHAR) \
989 *(p)++ = (0xE0 | ((c) >> 12)), \
990 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
991 *(p)++ = (0x80 | ((c) & 0x3F)); \
992 else if ((c) <= MAX_4_BYTE_CHAR) \
993 *(p)++ = (0xF0 | (c >> 18)), \
994 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
995 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
996 *(p)++ = (0x80 | (c & 0x3F)); \
997 else if ((c) <= MAX_5_BYTE_CHAR) \
998 *(p)++ = 0xF8, \
999 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
1000 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1001 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1002 *(p)++ = (0x80 | (c & 0x3F)); \
1003 else \
1004 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1005 } while (0)
1006
1007
1008 /* Return the character code of character whose multibyte form is at
1009 P, and advance P to the end of the multibyte form. This is like
1010 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1011
1012 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1013 (!((p)[0] & 0x80) \
1014 ? *(p)++ \
1015 : ! ((p)[0] & 0x20) \
1016 ? ((p) += 2, \
1017 ((((p)[-2] & 0x1F) << 6) \
1018 | ((p)[-1] & 0x3F) \
1019 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1020 : ! ((p)[0] & 0x10) \
1021 ? ((p) += 3, \
1022 ((((p)[-3] & 0x0F) << 12) \
1023 | (((p)[-2] & 0x3F) << 6) \
1024 | ((p)[-1] & 0x3F))) \
1025 : ! ((p)[0] & 0x08) \
1026 ? ((p) += 4, \
1027 ((((p)[-4] & 0xF) << 18) \
1028 | (((p)[-3] & 0x3F) << 12) \
1029 | (((p)[-2] & 0x3F) << 6) \
1030 | ((p)[-1] & 0x3F))) \
1031 : ((p) += 5, \
1032 ((((p)[-4] & 0x3F) << 18) \
1033 | (((p)[-3] & 0x3F) << 12) \
1034 | (((p)[-2] & 0x3F) << 6) \
1035 | ((p)[-1] & 0x3F))))
1036
1037
1038 static void
1039 coding_set_source (coding)
1040 struct coding_system *coding;
1041 {
1042 if (BUFFERP (coding->src_object))
1043 {
1044 struct buffer *buf = XBUFFER (coding->src_object);
1045
1046 if (coding->src_pos < 0)
1047 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1048 else
1049 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1050 }
1051 else if (STRINGP (coding->src_object))
1052 {
1053 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1054 }
1055 else
1056 /* Otherwise, the source is C string and is never relocated
1057 automatically. Thus we don't have to update anything. */
1058 ;
1059 }
1060
1061 static void
1062 coding_set_destination (coding)
1063 struct coding_system *coding;
1064 {
1065 if (BUFFERP (coding->dst_object))
1066 {
1067 if (coding->src_pos < 0)
1068 {
1069 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1070 coding->dst_bytes = (GAP_END_ADDR
1071 - (coding->src_bytes - coding->consumed)
1072 - coding->destination);
1073 }
1074 else
1075 {
1076 /* We are sure that coding->dst_pos_byte is before the gap
1077 of the buffer. */
1078 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1079 + coding->dst_pos_byte - BEG_BYTE);
1080 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1081 - coding->destination);
1082 }
1083 }
1084 else
1085 /* Otherwise, the destination is C string and is never relocated
1086 automatically. Thus we don't have to update anything. */
1087 ;
1088 }
1089
1090
1091 static void
1092 coding_alloc_by_realloc (coding, bytes)
1093 struct coding_system *coding;
1094 EMACS_INT bytes;
1095 {
1096 coding->destination = (unsigned char *) xrealloc (coding->destination,
1097 coding->dst_bytes + bytes);
1098 coding->dst_bytes += bytes;
1099 }
1100
1101 static void
1102 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1103 struct coding_system *coding;
1104 EMACS_INT gap_head_used, bytes;
1105 {
1106 if (EQ (coding->src_object, coding->dst_object))
1107 {
1108 /* The gap may contain the produced data at the head and not-yet
1109 consumed data at the tail. To preserve those data, we at
1110 first make the gap size to zero, then increase the gap
1111 size. */
1112 EMACS_INT add = GAP_SIZE;
1113
1114 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1115 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1116 make_gap (bytes);
1117 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1118 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1119 }
1120 else
1121 {
1122 Lisp_Object this_buffer;
1123
1124 this_buffer = Fcurrent_buffer ();
1125 set_buffer_internal (XBUFFER (coding->dst_object));
1126 make_gap (bytes);
1127 set_buffer_internal (XBUFFER (this_buffer));
1128 }
1129 }
1130
1131
1132 static unsigned char *
1133 alloc_destination (coding, nbytes, dst)
1134 struct coding_system *coding;
1135 EMACS_INT nbytes;
1136 unsigned char *dst;
1137 {
1138 EMACS_INT offset = dst - coding->destination;
1139
1140 if (BUFFERP (coding->dst_object))
1141 {
1142 struct buffer *buf = XBUFFER (coding->dst_object);
1143
1144 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1145 }
1146 else
1147 coding_alloc_by_realloc (coding, nbytes);
1148 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1149 coding_set_destination (coding);
1150 dst = coding->destination + offset;
1151 return dst;
1152 }
1153
1154 /** Macros for annotations. */
1155
1156 /* Maximum length of annotation data (sum of annotations for
1157 composition and charset). */
1158 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1159
1160 /* An annotation data is stored in the array coding->charbuf in this
1161 format:
1162 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1163 LENGTH is the number of elements in the annotation.
1164 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1165 NCHARS is the number of characters in the text annotated.
1166
1167 The format of the following elements depend on ANNOTATION_MASK.
1168
1169 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1170 follows:
1171 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1172 METHOD is one of enum composition_method.
1173 Optionnal COMPOSITION-COMPONENTS are characters and composition
1174 rules.
1175
1176 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1177 follows. */
1178
1179 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1180 do { \
1181 *(buf)++ = -(len); \
1182 *(buf)++ = (mask); \
1183 *(buf)++ = (nchars); \
1184 coding->annotated = 1; \
1185 } while (0);
1186
1187 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1188 do { \
1189 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1190 *buf++ = method; \
1191 } while (0)
1192
1193
1194 #define ADD_CHARSET_DATA(buf, nchars, id) \
1195 do { \
1196 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1197 *buf++ = id; \
1198 } while (0)
1199
1200 \f
1201 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1202
1203
1204
1205 \f
1206 /*** 3. UTF-8 ***/
1207
1208 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1209 Check if a text is encoded in UTF-8. If it is, return 1, else
1210 return 0. */
1211
1212 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1213 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1214 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1215 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1216 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1217 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1218
1219 static int
1220 detect_coding_utf_8 (coding, detect_info)
1221 struct coding_system *coding;
1222 struct coding_detection_info *detect_info;
1223 {
1224 const unsigned char *src = coding->source, *src_base;
1225 const unsigned char *src_end = coding->source + coding->src_bytes;
1226 int multibytep = coding->src_multibyte;
1227 int consumed_chars = 0;
1228 int found = 0;
1229
1230 detect_info->checked |= CATEGORY_MASK_UTF_8;
1231 /* A coding system of this category is always ASCII compatible. */
1232 src += coding->head_ascii;
1233
1234 while (1)
1235 {
1236 int c, c1, c2, c3, c4;
1237
1238 src_base = src;
1239 ONE_MORE_BYTE (c);
1240 if (c < 0 || UTF_8_1_OCTET_P (c))
1241 continue;
1242 ONE_MORE_BYTE (c1);
1243 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1244 break;
1245 if (UTF_8_2_OCTET_LEADING_P (c))
1246 {
1247 found = CATEGORY_MASK_UTF_8;
1248 continue;
1249 }
1250 ONE_MORE_BYTE (c2);
1251 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1252 break;
1253 if (UTF_8_3_OCTET_LEADING_P (c))
1254 {
1255 found = CATEGORY_MASK_UTF_8;
1256 continue;
1257 }
1258 ONE_MORE_BYTE (c3);
1259 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1260 break;
1261 if (UTF_8_4_OCTET_LEADING_P (c))
1262 {
1263 found = CATEGORY_MASK_UTF_8;
1264 continue;
1265 }
1266 ONE_MORE_BYTE (c4);
1267 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1268 break;
1269 if (UTF_8_5_OCTET_LEADING_P (c))
1270 {
1271 found = CATEGORY_MASK_UTF_8;
1272 continue;
1273 }
1274 break;
1275 }
1276 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1277 return 0;
1278
1279 no_more_source:
1280 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1281 {
1282 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1283 return 0;
1284 }
1285 detect_info->found |= found;
1286 return 1;
1287 }
1288
1289
1290 static void
1291 decode_coding_utf_8 (coding)
1292 struct coding_system *coding;
1293 {
1294 const unsigned char *src = coding->source + coding->consumed;
1295 const unsigned char *src_end = coding->source + coding->src_bytes;
1296 const unsigned char *src_base;
1297 int *charbuf = coding->charbuf + coding->charbuf_used;
1298 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1299 int consumed_chars = 0, consumed_chars_base;
1300 int multibytep = coding->src_multibyte;
1301 Lisp_Object attr, charset_list;
1302 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1303 int byte_after_cr = -1;
1304
1305 CODING_GET_INFO (coding, attr, charset_list);
1306
1307 while (1)
1308 {
1309 int c, c1, c2, c3, c4, c5;
1310
1311 src_base = src;
1312 consumed_chars_base = consumed_chars;
1313
1314 if (charbuf >= charbuf_end)
1315 break;
1316
1317 if (byte_after_cr >= 0)
1318 c1 = byte_after_cr, byte_after_cr = -1;
1319 else
1320 ONE_MORE_BYTE (c1);
1321 if (c1 < 0)
1322 {
1323 c = - c1;
1324 }
1325 else if (UTF_8_1_OCTET_P(c1))
1326 {
1327 if (eol_crlf && c1 == '\r')
1328 ONE_MORE_BYTE (byte_after_cr);
1329 c = c1;
1330 }
1331 else
1332 {
1333 ONE_MORE_BYTE (c2);
1334 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1335 goto invalid_code;
1336 if (UTF_8_2_OCTET_LEADING_P (c1))
1337 {
1338 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1339 /* Reject overlong sequences here and below. Encoders
1340 producing them are incorrect, they can be misleading,
1341 and they mess up read/write invariance. */
1342 if (c < 128)
1343 goto invalid_code;
1344 }
1345 else
1346 {
1347 ONE_MORE_BYTE (c3);
1348 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1349 goto invalid_code;
1350 if (UTF_8_3_OCTET_LEADING_P (c1))
1351 {
1352 c = (((c1 & 0xF) << 12)
1353 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1354 if (c < 0x800
1355 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1356 goto invalid_code;
1357 }
1358 else
1359 {
1360 ONE_MORE_BYTE (c4);
1361 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1362 goto invalid_code;
1363 if (UTF_8_4_OCTET_LEADING_P (c1))
1364 {
1365 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1366 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1367 if (c < 0x10000)
1368 goto invalid_code;
1369 }
1370 else
1371 {
1372 ONE_MORE_BYTE (c5);
1373 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1374 goto invalid_code;
1375 if (UTF_8_5_OCTET_LEADING_P (c1))
1376 {
1377 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1378 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1379 | (c5 & 0x3F));
1380 if ((c > MAX_CHAR) || (c < 0x200000))
1381 goto invalid_code;
1382 }
1383 else
1384 goto invalid_code;
1385 }
1386 }
1387 }
1388 }
1389
1390 *charbuf++ = c;
1391 continue;
1392
1393 invalid_code:
1394 src = src_base;
1395 consumed_chars = consumed_chars_base;
1396 ONE_MORE_BYTE (c);
1397 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1398 coding->errors++;
1399 }
1400
1401 no_more_source:
1402 coding->consumed_char += consumed_chars_base;
1403 coding->consumed = src_base - coding->source;
1404 coding->charbuf_used = charbuf - coding->charbuf;
1405 }
1406
1407
1408 static int
1409 encode_coding_utf_8 (coding)
1410 struct coding_system *coding;
1411 {
1412 int multibytep = coding->dst_multibyte;
1413 int *charbuf = coding->charbuf;
1414 int *charbuf_end = charbuf + coding->charbuf_used;
1415 unsigned char *dst = coding->destination + coding->produced;
1416 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1417 int produced_chars = 0;
1418 int c;
1419
1420 if (multibytep)
1421 {
1422 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1423
1424 while (charbuf < charbuf_end)
1425 {
1426 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1427
1428 ASSURE_DESTINATION (safe_room);
1429 c = *charbuf++;
1430 if (CHAR_BYTE8_P (c))
1431 {
1432 c = CHAR_TO_BYTE8 (c);
1433 EMIT_ONE_BYTE (c);
1434 }
1435 else
1436 {
1437 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1438 for (p = str; p < pend; p++)
1439 EMIT_ONE_BYTE (*p);
1440 }
1441 }
1442 }
1443 else
1444 {
1445 int safe_room = MAX_MULTIBYTE_LENGTH;
1446
1447 while (charbuf < charbuf_end)
1448 {
1449 ASSURE_DESTINATION (safe_room);
1450 c = *charbuf++;
1451 if (CHAR_BYTE8_P (c))
1452 *dst++ = CHAR_TO_BYTE8 (c);
1453 else
1454 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1455 produced_chars++;
1456 }
1457 }
1458 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1459 coding->produced_char += produced_chars;
1460 coding->produced = dst - coding->destination;
1461 return 0;
1462 }
1463
1464
1465 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1466 Check if a text is encoded in one of UTF-16 based coding systems.
1467 If it is, return 1, else return 0. */
1468
1469 #define UTF_16_HIGH_SURROGATE_P(val) \
1470 (((val) & 0xFC00) == 0xD800)
1471
1472 #define UTF_16_LOW_SURROGATE_P(val) \
1473 (((val) & 0xFC00) == 0xDC00)
1474
1475 #define UTF_16_INVALID_P(val) \
1476 (((val) == 0xFFFE) \
1477 || ((val) == 0xFFFF) \
1478 || UTF_16_LOW_SURROGATE_P (val))
1479
1480
1481 static int
1482 detect_coding_utf_16 (coding, detect_info)
1483 struct coding_system *coding;
1484 struct coding_detection_info *detect_info;
1485 {
1486 const unsigned char *src = coding->source, *src_base = src;
1487 const unsigned char *src_end = coding->source + coding->src_bytes;
1488 int multibytep = coding->src_multibyte;
1489 int consumed_chars = 0;
1490 int c1, c2;
1491
1492 detect_info->checked |= CATEGORY_MASK_UTF_16;
1493 if (coding->mode & CODING_MODE_LAST_BLOCK
1494 && (coding->src_chars & 1))
1495 {
1496 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1497 return 0;
1498 }
1499
1500 ONE_MORE_BYTE (c1);
1501 ONE_MORE_BYTE (c2);
1502 if ((c1 == 0xFF) && (c2 == 0xFE))
1503 {
1504 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1505 | CATEGORY_MASK_UTF_16_AUTO);
1506 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1507 | CATEGORY_MASK_UTF_16_BE_NOSIG
1508 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1509 }
1510 else if ((c1 == 0xFE) && (c2 == 0xFF))
1511 {
1512 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1513 | CATEGORY_MASK_UTF_16_AUTO);
1514 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1515 | CATEGORY_MASK_UTF_16_BE_NOSIG
1516 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1517 }
1518 else
1519 {
1520 /* We check the dispersion of Eth and Oth bytes where E is even and
1521 O is odd. If both are high, we assume binary data.*/
1522 unsigned char e[256], o[256];
1523 unsigned e_num = 1, o_num = 1;
1524
1525 memset (e, 0, 256);
1526 memset (o, 0, 256);
1527 e[c1] = 1;
1528 o[c2] = 1;
1529
1530 detect_info->rejected
1531 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1532
1533 while (1)
1534 {
1535 ONE_MORE_BYTE (c1);
1536 ONE_MORE_BYTE (c2);
1537 if (! e[c1])
1538 {
1539 e[c1] = 1;
1540 e_num++;
1541 if (e_num >= 128)
1542 break;
1543 }
1544 if (! o[c2])
1545 {
1546 o[c1] = 1;
1547 o_num++;
1548 if (o_num >= 128)
1549 break;
1550 }
1551 }
1552 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1553 return 0;
1554 }
1555
1556 no_more_source:
1557 return 1;
1558 }
1559
1560 static void
1561 decode_coding_utf_16 (coding)
1562 struct coding_system *coding;
1563 {
1564 const unsigned char *src = coding->source + coding->consumed;
1565 const unsigned char *src_end = coding->source + coding->src_bytes;
1566 const unsigned char *src_base;
1567 int *charbuf = coding->charbuf + coding->charbuf_used;
1568 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1569 int consumed_chars = 0, consumed_chars_base;
1570 int multibytep = coding->src_multibyte;
1571 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1572 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1573 int surrogate = CODING_UTF_16_SURROGATE (coding);
1574 Lisp_Object attr, charset_list;
1575 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1576 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1577
1578 CODING_GET_INFO (coding, attr, charset_list);
1579
1580 if (bom == utf_16_with_bom)
1581 {
1582 int c, c1, c2;
1583
1584 src_base = src;
1585 ONE_MORE_BYTE (c1);
1586 ONE_MORE_BYTE (c2);
1587 c = (c1 << 8) | c2;
1588
1589 if (endian == utf_16_big_endian
1590 ? c != 0xFEFF : c != 0xFFFE)
1591 {
1592 /* The first two bytes are not BOM. Treat them as bytes
1593 for a normal character. */
1594 src = src_base;
1595 coding->errors++;
1596 }
1597 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1598 }
1599 else if (bom == utf_16_detect_bom)
1600 {
1601 /* We have already tried to detect BOM and failed in
1602 detect_coding. */
1603 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1604 }
1605
1606 while (1)
1607 {
1608 int c, c1, c2;
1609
1610 src_base = src;
1611 consumed_chars_base = consumed_chars;
1612
1613 if (charbuf + 2 >= charbuf_end)
1614 break;
1615
1616 if (byte_after_cr1 >= 0)
1617 c1 = byte_after_cr1, byte_after_cr1 = -1;
1618 else
1619 ONE_MORE_BYTE (c1);
1620 if (c1 < 0)
1621 {
1622 *charbuf++ = -c1;
1623 continue;
1624 }
1625 if (byte_after_cr2 >= 0)
1626 c2 = byte_after_cr2, byte_after_cr2 = -1;
1627 else
1628 ONE_MORE_BYTE (c2);
1629 if (c2 < 0)
1630 {
1631 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1632 *charbuf++ = -c2;
1633 continue;
1634 }
1635 c = (endian == utf_16_big_endian
1636 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1637
1638 if (surrogate)
1639 {
1640 if (! UTF_16_LOW_SURROGATE_P (c))
1641 {
1642 if (endian == utf_16_big_endian)
1643 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1644 else
1645 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1646 *charbuf++ = c1;
1647 *charbuf++ = c2;
1648 coding->errors++;
1649 if (UTF_16_HIGH_SURROGATE_P (c))
1650 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1651 else
1652 *charbuf++ = c;
1653 }
1654 else
1655 {
1656 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1657 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1658 *charbuf++ = 0x10000 + c;
1659 }
1660 }
1661 else
1662 {
1663 if (UTF_16_HIGH_SURROGATE_P (c))
1664 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1665 else
1666 {
1667 if (eol_crlf && c == '\r')
1668 {
1669 ONE_MORE_BYTE (byte_after_cr1);
1670 ONE_MORE_BYTE (byte_after_cr2);
1671 }
1672 *charbuf++ = c;
1673 }
1674 }
1675 }
1676
1677 no_more_source:
1678 coding->consumed_char += consumed_chars_base;
1679 coding->consumed = src_base - coding->source;
1680 coding->charbuf_used = charbuf - coding->charbuf;
1681 }
1682
1683 static int
1684 encode_coding_utf_16 (coding)
1685 struct coding_system *coding;
1686 {
1687 int multibytep = coding->dst_multibyte;
1688 int *charbuf = coding->charbuf;
1689 int *charbuf_end = charbuf + coding->charbuf_used;
1690 unsigned char *dst = coding->destination + coding->produced;
1691 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1692 int safe_room = 8;
1693 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1694 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1695 int produced_chars = 0;
1696 Lisp_Object attrs, charset_list;
1697 int c;
1698
1699 CODING_GET_INFO (coding, attrs, charset_list);
1700
1701 if (bom != utf_16_without_bom)
1702 {
1703 ASSURE_DESTINATION (safe_room);
1704 if (big_endian)
1705 EMIT_TWO_BYTES (0xFE, 0xFF);
1706 else
1707 EMIT_TWO_BYTES (0xFF, 0xFE);
1708 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1709 }
1710
1711 while (charbuf < charbuf_end)
1712 {
1713 ASSURE_DESTINATION (safe_room);
1714 c = *charbuf++;
1715 if (c >= MAX_UNICODE_CHAR)
1716 c = coding->default_char;
1717
1718 if (c < 0x10000)
1719 {
1720 if (big_endian)
1721 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1722 else
1723 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1724 }
1725 else
1726 {
1727 int c1, c2;
1728
1729 c -= 0x10000;
1730 c1 = (c >> 10) + 0xD800;
1731 c2 = (c & 0x3FF) + 0xDC00;
1732 if (big_endian)
1733 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1734 else
1735 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1736 }
1737 }
1738 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1739 coding->produced = dst - coding->destination;
1740 coding->produced_char += produced_chars;
1741 return 0;
1742 }
1743
1744 \f
1745 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1746
1747 /* Emacs' internal format for representation of multiple character
1748 sets is a kind of multi-byte encoding, i.e. characters are
1749 represented by variable-length sequences of one-byte codes.
1750
1751 ASCII characters and control characters (e.g. `tab', `newline') are
1752 represented by one-byte sequences which are their ASCII codes, in
1753 the range 0x00 through 0x7F.
1754
1755 8-bit characters of the range 0x80..0x9F are represented by
1756 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1757 code + 0x20).
1758
1759 8-bit characters of the range 0xA0..0xFF are represented by
1760 one-byte sequences which are their 8-bit code.
1761
1762 The other characters are represented by a sequence of `base
1763 leading-code', optional `extended leading-code', and one or two
1764 `position-code's. The length of the sequence is determined by the
1765 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1766 whereas extended leading-code and position-code take the range 0xA0
1767 through 0xFF. See `charset.h' for more details about leading-code
1768 and position-code.
1769
1770 --- CODE RANGE of Emacs' internal format ---
1771 character set range
1772 ------------- -----
1773 ascii 0x00..0x7F
1774 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1775 eight-bit-graphic 0xA0..0xBF
1776 ELSE 0x81..0x9D + [0xA0..0xFF]+
1777 ---------------------------------------------
1778
1779 As this is the internal character representation, the format is
1780 usually not used externally (i.e. in a file or in a data sent to a
1781 process). But, it is possible to have a text externally in this
1782 format (i.e. by encoding by the coding system `emacs-mule').
1783
1784 In that case, a sequence of one-byte codes has a slightly different
1785 form.
1786
1787 At first, all characters in eight-bit-control are represented by
1788 one-byte sequences which are their 8-bit code.
1789
1790 Next, character composition data are represented by the byte
1791 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1792 where,
1793 METHOD is 0xF0 plus one of composition method (enum
1794 composition_method),
1795
1796 BYTES is 0xA0 plus a byte length of this composition data,
1797
1798 CHARS is 0x20 plus a number of characters composed by this
1799 data,
1800
1801 COMPONENTs are characters of multibye form or composition
1802 rules encoded by two-byte of ASCII codes.
1803
1804 In addition, for backward compatibility, the following formats are
1805 also recognized as composition data on decoding.
1806
1807 0x80 MSEQ ...
1808 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1809
1810 Here,
1811 MSEQ is a multibyte form but in these special format:
1812 ASCII: 0xA0 ASCII_CODE+0x80,
1813 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1814 RULE is a one byte code of the range 0xA0..0xF0 that
1815 represents a composition rule.
1816 */
1817
1818 char emacs_mule_bytes[256];
1819
1820 int
1821 emacs_mule_char (coding, src, nbytes, nchars, id)
1822 struct coding_system *coding;
1823 const unsigned char *src;
1824 int *nbytes, *nchars, *id;
1825 {
1826 const unsigned char *src_end = coding->source + coding->src_bytes;
1827 const unsigned char *src_base = src;
1828 int multibytep = coding->src_multibyte;
1829 struct charset *charset;
1830 unsigned code;
1831 int c;
1832 int consumed_chars = 0;
1833
1834 ONE_MORE_BYTE (c);
1835 if (c < 0)
1836 {
1837 c = -c;
1838 charset = emacs_mule_charset[0];
1839 }
1840 else
1841 {
1842 if (c >= 0xA0)
1843 {
1844 /* Old style component character of a composition. */
1845 if (c == 0xA0)
1846 {
1847 ONE_MORE_BYTE (c);
1848 c -= 0x80;
1849 }
1850 else
1851 c -= 0x20;
1852 }
1853
1854 switch (emacs_mule_bytes[c])
1855 {
1856 case 2:
1857 if (! (charset = emacs_mule_charset[c]))
1858 goto invalid_code;
1859 ONE_MORE_BYTE (c);
1860 if (c < 0xA0)
1861 goto invalid_code;
1862 code = c & 0x7F;
1863 break;
1864
1865 case 3:
1866 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1867 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1868 {
1869 ONE_MORE_BYTE (c);
1870 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1871 goto invalid_code;
1872 ONE_MORE_BYTE (c);
1873 if (c < 0xA0)
1874 goto invalid_code;
1875 code = c & 0x7F;
1876 }
1877 else
1878 {
1879 if (! (charset = emacs_mule_charset[c]))
1880 goto invalid_code;
1881 ONE_MORE_BYTE (c);
1882 if (c < 0xA0)
1883 goto invalid_code;
1884 code = (c & 0x7F) << 8;
1885 ONE_MORE_BYTE (c);
1886 if (c < 0xA0)
1887 goto invalid_code;
1888 code |= c & 0x7F;
1889 }
1890 break;
1891
1892 case 4:
1893 ONE_MORE_BYTE (c);
1894 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1895 goto invalid_code;
1896 ONE_MORE_BYTE (c);
1897 if (c < 0xA0)
1898 goto invalid_code;
1899 code = (c & 0x7F) << 8;
1900 ONE_MORE_BYTE (c);
1901 if (c < 0xA0)
1902 goto invalid_code;
1903 code |= c & 0x7F;
1904 break;
1905
1906 case 1:
1907 code = c;
1908 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1909 ? charset_ascii : charset_eight_bit);
1910 break;
1911
1912 default:
1913 abort ();
1914 }
1915 c = DECODE_CHAR (charset, code);
1916 if (c < 0)
1917 goto invalid_code;
1918 }
1919 *nbytes = src - src_base;
1920 *nchars = consumed_chars;
1921 if (id)
1922 *id = charset->id;
1923 return c;
1924
1925 no_more_source:
1926 return -2;
1927
1928 invalid_code:
1929 return -1;
1930 }
1931
1932
1933 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1934 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1935 else return 0. */
1936
1937 static int
1938 detect_coding_emacs_mule (coding, detect_info)
1939 struct coding_system *coding;
1940 struct coding_detection_info *detect_info;
1941 {
1942 const unsigned char *src = coding->source, *src_base;
1943 const unsigned char *src_end = coding->source + coding->src_bytes;
1944 int multibytep = coding->src_multibyte;
1945 int consumed_chars = 0;
1946 int c;
1947 int found = 0;
1948
1949 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1950 /* A coding system of this category is always ASCII compatible. */
1951 src += coding->head_ascii;
1952
1953 while (1)
1954 {
1955 src_base = src;
1956 ONE_MORE_BYTE (c);
1957 if (c < 0)
1958 continue;
1959 if (c == 0x80)
1960 {
1961 /* Perhaps the start of composite character. We simple skip
1962 it because analyzing it is too heavy for detecting. But,
1963 at least, we check that the composite character
1964 constitutes of more than 4 bytes. */
1965 const unsigned char *src_base;
1966
1967 repeat:
1968 src_base = src;
1969 do
1970 {
1971 ONE_MORE_BYTE (c);
1972 }
1973 while (c >= 0xA0);
1974
1975 if (src - src_base <= 4)
1976 break;
1977 found = CATEGORY_MASK_EMACS_MULE;
1978 if (c == 0x80)
1979 goto repeat;
1980 }
1981
1982 if (c < 0x80)
1983 {
1984 if (c < 0x20
1985 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1986 break;
1987 }
1988 else
1989 {
1990 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1991
1992 while (more_bytes > 0)
1993 {
1994 ONE_MORE_BYTE (c);
1995 if (c < 0xA0)
1996 {
1997 src--; /* Unread the last byte. */
1998 break;
1999 }
2000 more_bytes--;
2001 }
2002 if (more_bytes != 0)
2003 break;
2004 found = CATEGORY_MASK_EMACS_MULE;
2005 }
2006 }
2007 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2008 return 0;
2009
2010 no_more_source:
2011 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2012 {
2013 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2014 return 0;
2015 }
2016 detect_info->found |= found;
2017 return 1;
2018 }
2019
2020
2021 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2022
2023 /* Decode a character represented as a component of composition
2024 sequence of Emacs 20/21 style at SRC. Set C to that character and
2025 update SRC to the head of next character (or an encoded composition
2026 rule). If SRC doesn't points a composition component, set C to -1.
2027 If SRC points an invalid byte sequence, global exit by a return
2028 value 0. */
2029
2030 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
2031 do \
2032 { \
2033 int c; \
2034 int nbytes, nchars; \
2035 \
2036 if (src == src_end) \
2037 break; \
2038 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
2039 if (c < 0) \
2040 { \
2041 if (c == -2) \
2042 break; \
2043 goto invalid_code; \
2044 } \
2045 *buf++ = c; \
2046 src += nbytes; \
2047 consumed_chars += nchars; \
2048 } \
2049 while (0)
2050
2051
2052 /* Decode a composition rule represented as a component of composition
2053 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
2054 and increment BUF. If SRC points an invalid byte sequence, set C
2055 to -1. */
2056
2057 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
2058 do { \
2059 int c, gref, nref; \
2060 \
2061 if (src >= src_end) \
2062 goto invalid_code; \
2063 ONE_MORE_BYTE_NO_CHECK (c); \
2064 c -= 0xA0; \
2065 if (c < 0 || c >= 81) \
2066 goto invalid_code; \
2067 \
2068 gref = c / 9, nref = c % 9; \
2069 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2070 } while (0)
2071
2072
2073 /* Decode a composition rule represented as a component of composition
2074 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
2075 and increment BUF. If SRC points an invalid byte sequence, set C
2076 to -1. */
2077
2078 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
2079 do { \
2080 int gref, nref; \
2081 \
2082 if (src + 1>= src_end) \
2083 goto invalid_code; \
2084 ONE_MORE_BYTE_NO_CHECK (gref); \
2085 gref -= 0x20; \
2086 ONE_MORE_BYTE_NO_CHECK (nref); \
2087 nref -= 0x20; \
2088 if (gref < 0 || gref >= 81 \
2089 || nref < 0 || nref >= 81) \
2090 goto invalid_code; \
2091 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2092 } while (0)
2093
2094
2095 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
2096 do { \
2097 /* Emacs 21 style format. The first three bytes at SRC are \
2098 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
2099 the byte length of this composition information, CHARS is the \
2100 number of characters composed by this composition. */ \
2101 enum composition_method method = c - 0xF2; \
2102 int *charbuf_base = charbuf; \
2103 int consumed_chars_limit; \
2104 int nbytes, nchars; \
2105 \
2106 ONE_MORE_BYTE (c); \
2107 if (c < 0) \
2108 goto invalid_code; \
2109 nbytes = c - 0xA0; \
2110 if (nbytes < 3) \
2111 goto invalid_code; \
2112 ONE_MORE_BYTE (c); \
2113 if (c < 0) \
2114 goto invalid_code; \
2115 nchars = c - 0xA0; \
2116 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2117 consumed_chars_limit = consumed_chars_base + nbytes; \
2118 if (method != COMPOSITION_RELATIVE) \
2119 { \
2120 int i = 0; \
2121 while (consumed_chars < consumed_chars_limit) \
2122 { \
2123 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
2124 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
2125 else \
2126 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
2127 i++; \
2128 } \
2129 if (consumed_chars < consumed_chars_limit) \
2130 goto invalid_code; \
2131 charbuf_base[0] -= i; \
2132 } \
2133 } while (0)
2134
2135
2136 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2137 do { \
2138 /* Emacs 20 style format for relative composition. */ \
2139 /* Store multibyte form of characters to be composed. */ \
2140 enum composition_method method = COMPOSITION_RELATIVE; \
2141 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2142 int *buf = components; \
2143 int i, j; \
2144 \
2145 src = src_base; \
2146 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2147 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2148 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2149 if (i < 2) \
2150 goto invalid_code; \
2151 ADD_COMPOSITION_DATA (charbuf, i, method); \
2152 for (j = 0; j < i; j++) \
2153 *charbuf++ = components[j]; \
2154 } while (0)
2155
2156
2157 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2158 do { \
2159 /* Emacs 20 style format for rule-base composition. */ \
2160 /* Store multibyte form of characters to be composed. */ \
2161 enum composition_method method = COMPOSITION_WITH_RULE; \
2162 int *charbuf_base = charbuf; \
2163 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2164 int *buf = components; \
2165 int i, j; \
2166 \
2167 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2168 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
2169 { \
2170 if (*src < 0xA0) \
2171 break; \
2172 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2173 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2174 } \
2175 if (i <= 1 || (buf - components) % 2 == 0) \
2176 goto invalid_code; \
2177 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
2178 goto no_more_source; \
2179 ADD_COMPOSITION_DATA (charbuf, i, method); \
2180 i = i * 2 - 1; \
2181 for (j = 0; j < i; j++) \
2182 *charbuf++ = components[j]; \
2183 charbuf_base[0] -= i; \
2184 for (j = 0; j < i; j += 2) \
2185 *charbuf++ = components[j]; \
2186 } while (0)
2187
2188
2189 static void
2190 decode_coding_emacs_mule (coding)
2191 struct coding_system *coding;
2192 {
2193 const unsigned char *src = coding->source + coding->consumed;
2194 const unsigned char *src_end = coding->source + coding->src_bytes;
2195 const unsigned char *src_base;
2196 int *charbuf = coding->charbuf + coding->charbuf_used;
2197 int *charbuf_end
2198 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2199 int consumed_chars = 0, consumed_chars_base;
2200 int multibytep = coding->src_multibyte;
2201 Lisp_Object attrs, charset_list;
2202 int char_offset = coding->produced_char;
2203 int last_offset = char_offset;
2204 int last_id = charset_ascii;
2205 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2206 int byte_after_cr = -1;
2207
2208 CODING_GET_INFO (coding, attrs, charset_list);
2209
2210 while (1)
2211 {
2212 int c;
2213
2214 src_base = src;
2215 consumed_chars_base = consumed_chars;
2216
2217 if (charbuf >= charbuf_end)
2218 break;
2219
2220 if (byte_after_cr >= 0)
2221 c = byte_after_cr, byte_after_cr = -1;
2222 else
2223 ONE_MORE_BYTE (c);
2224 if (c < 0)
2225 {
2226 *charbuf++ = -c;
2227 char_offset++;
2228 }
2229 else if (c < 0x80)
2230 {
2231 if (eol_crlf && c == '\r')
2232 ONE_MORE_BYTE (byte_after_cr);
2233 *charbuf++ = c;
2234 char_offset++;
2235 }
2236 else if (c == 0x80)
2237 {
2238 ONE_MORE_BYTE (c);
2239 if (c < 0)
2240 goto invalid_code;
2241 if (c - 0xF2 >= COMPOSITION_RELATIVE
2242 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2243 DECODE_EMACS_MULE_21_COMPOSITION (c);
2244 else if (c < 0xC0)
2245 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2246 else if (c == 0xFF)
2247 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2248 else
2249 goto invalid_code;
2250 }
2251 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2252 {
2253 int nbytes, nchars;
2254 int id;
2255
2256 src = src_base;
2257 consumed_chars = consumed_chars_base;
2258 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2259 if (c < 0)
2260 {
2261 if (c == -2)
2262 break;
2263 goto invalid_code;
2264 }
2265 if (last_id != id)
2266 {
2267 if (last_id != charset_ascii)
2268 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2269 last_id = id;
2270 last_offset = char_offset;
2271 }
2272 *charbuf++ = c;
2273 src += nbytes;
2274 consumed_chars += nchars;
2275 char_offset++;
2276 }
2277 else
2278 goto invalid_code;
2279 continue;
2280
2281 invalid_code:
2282 src = src_base;
2283 consumed_chars = consumed_chars_base;
2284 ONE_MORE_BYTE (c);
2285 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2286 char_offset++;
2287 coding->errors++;
2288 }
2289
2290 no_more_source:
2291 if (last_id != charset_ascii)
2292 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2293 coding->consumed_char += consumed_chars_base;
2294 coding->consumed = src_base - coding->source;
2295 coding->charbuf_used = charbuf - coding->charbuf;
2296 }
2297
2298
2299 #define EMACS_MULE_LEADING_CODES(id, codes) \
2300 do { \
2301 if (id < 0xA0) \
2302 codes[0] = id, codes[1] = 0; \
2303 else if (id < 0xE0) \
2304 codes[0] = 0x9A, codes[1] = id; \
2305 else if (id < 0xF0) \
2306 codes[0] = 0x9B, codes[1] = id; \
2307 else if (id < 0xF5) \
2308 codes[0] = 0x9C, codes[1] = id; \
2309 else \
2310 codes[0] = 0x9D, codes[1] = id; \
2311 } while (0);
2312
2313
2314 static int
2315 encode_coding_emacs_mule (coding)
2316 struct coding_system *coding;
2317 {
2318 int multibytep = coding->dst_multibyte;
2319 int *charbuf = coding->charbuf;
2320 int *charbuf_end = charbuf + coding->charbuf_used;
2321 unsigned char *dst = coding->destination + coding->produced;
2322 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2323 int safe_room = 8;
2324 int produced_chars = 0;
2325 Lisp_Object attrs, charset_list;
2326 int c;
2327 int preferred_charset_id = -1;
2328
2329 CODING_GET_INFO (coding, attrs, charset_list);
2330 if (! EQ (charset_list, Vemacs_mule_charset_list))
2331 {
2332 CODING_ATTR_CHARSET_LIST (attrs)
2333 = charset_list = Vemacs_mule_charset_list;
2334 }
2335
2336 while (charbuf < charbuf_end)
2337 {
2338 ASSURE_DESTINATION (safe_room);
2339 c = *charbuf++;
2340
2341 if (c < 0)
2342 {
2343 /* Handle an annotation. */
2344 switch (*charbuf)
2345 {
2346 case CODING_ANNOTATE_COMPOSITION_MASK:
2347 /* Not yet implemented. */
2348 break;
2349 case CODING_ANNOTATE_CHARSET_MASK:
2350 preferred_charset_id = charbuf[3];
2351 if (preferred_charset_id >= 0
2352 && NILP (Fmemq (make_number (preferred_charset_id),
2353 charset_list)))
2354 preferred_charset_id = -1;
2355 break;
2356 default:
2357 abort ();
2358 }
2359 charbuf += -c - 1;
2360 continue;
2361 }
2362
2363 if (ASCII_CHAR_P (c))
2364 EMIT_ONE_ASCII_BYTE (c);
2365 else if (CHAR_BYTE8_P (c))
2366 {
2367 c = CHAR_TO_BYTE8 (c);
2368 EMIT_ONE_BYTE (c);
2369 }
2370 else
2371 {
2372 struct charset *charset;
2373 unsigned code;
2374 int dimension;
2375 int emacs_mule_id;
2376 unsigned char leading_codes[2];
2377
2378 if (preferred_charset_id >= 0)
2379 {
2380 charset = CHARSET_FROM_ID (preferred_charset_id);
2381 if (! CHAR_CHARSET_P (c, charset))
2382 charset = char_charset (c, charset_list, NULL);
2383 }
2384 else
2385 charset = char_charset (c, charset_list, &code);
2386 if (! charset)
2387 {
2388 c = coding->default_char;
2389 if (ASCII_CHAR_P (c))
2390 {
2391 EMIT_ONE_ASCII_BYTE (c);
2392 continue;
2393 }
2394 charset = char_charset (c, charset_list, &code);
2395 }
2396 dimension = CHARSET_DIMENSION (charset);
2397 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2398 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2399 EMIT_ONE_BYTE (leading_codes[0]);
2400 if (leading_codes[1])
2401 EMIT_ONE_BYTE (leading_codes[1]);
2402 if (dimension == 1)
2403 EMIT_ONE_BYTE (code | 0x80);
2404 else
2405 {
2406 code |= 0x8080;
2407 EMIT_ONE_BYTE (code >> 8);
2408 EMIT_ONE_BYTE (code & 0xFF);
2409 }
2410 }
2411 }
2412 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2413 coding->produced_char += produced_chars;
2414 coding->produced = dst - coding->destination;
2415 return 0;
2416 }
2417
2418 \f
2419 /*** 7. ISO2022 handlers ***/
2420
2421 /* The following note describes the coding system ISO2022 briefly.
2422 Since the intention of this note is to help understand the
2423 functions in this file, some parts are NOT ACCURATE or are OVERLY
2424 SIMPLIFIED. For thorough understanding, please refer to the
2425 original document of ISO2022. This is equivalent to the standard
2426 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2427
2428 ISO2022 provides many mechanisms to encode several character sets
2429 in 7-bit and 8-bit environments. For 7-bit environments, all text
2430 is encoded using bytes less than 128. This may make the encoded
2431 text a little bit longer, but the text passes more easily through
2432 several types of gateway, some of which strip off the MSB (Most
2433 Significant Bit).
2434
2435 There are two kinds of character sets: control character sets and
2436 graphic character sets. The former contain control characters such
2437 as `newline' and `escape' to provide control functions (control
2438 functions are also provided by escape sequences). The latter
2439 contain graphic characters such as 'A' and '-'. Emacs recognizes
2440 two control character sets and many graphic character sets.
2441
2442 Graphic character sets are classified into one of the following
2443 four classes, according to the number of bytes (DIMENSION) and
2444 number of characters in one dimension (CHARS) of the set:
2445 - DIMENSION1_CHARS94
2446 - DIMENSION1_CHARS96
2447 - DIMENSION2_CHARS94
2448 - DIMENSION2_CHARS96
2449
2450 In addition, each character set is assigned an identification tag,
2451 unique for each set, called the "final character" (denoted as <F>
2452 hereafter). The <F> of each character set is decided by ECMA(*)
2453 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2454 (0x30..0x3F are for private use only).
2455
2456 Note (*): ECMA = European Computer Manufacturers Association
2457
2458 Here are examples of graphic character sets [NAME(<F>)]:
2459 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2460 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2461 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2462 o DIMENSION2_CHARS96 -- none for the moment
2463
2464 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2465 C0 [0x00..0x1F] -- control character plane 0
2466 GL [0x20..0x7F] -- graphic character plane 0
2467 C1 [0x80..0x9F] -- control character plane 1
2468 GR [0xA0..0xFF] -- graphic character plane 1
2469
2470 A control character set is directly designated and invoked to C0 or
2471 C1 by an escape sequence. The most common case is that:
2472 - ISO646's control character set is designated/invoked to C0, and
2473 - ISO6429's control character set is designated/invoked to C1,
2474 and usually these designations/invocations are omitted in encoded
2475 text. In a 7-bit environment, only C0 can be used, and a control
2476 character for C1 is encoded by an appropriate escape sequence to
2477 fit into the environment. All control characters for C1 are
2478 defined to have corresponding escape sequences.
2479
2480 A graphic character set is at first designated to one of four
2481 graphic registers (G0 through G3), then these graphic registers are
2482 invoked to GL or GR. These designations and invocations can be
2483 done independently. The most common case is that G0 is invoked to
2484 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2485 these invocations and designations are omitted in encoded text.
2486 In a 7-bit environment, only GL can be used.
2487
2488 When a graphic character set of CHARS94 is invoked to GL, codes
2489 0x20 and 0x7F of the GL area work as control characters SPACE and
2490 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2491 be used.
2492
2493 There are two ways of invocation: locking-shift and single-shift.
2494 With locking-shift, the invocation lasts until the next different
2495 invocation, whereas with single-shift, the invocation affects the
2496 following character only and doesn't affect the locking-shift
2497 state. Invocations are done by the following control characters or
2498 escape sequences:
2499
2500 ----------------------------------------------------------------------
2501 abbrev function cntrl escape seq description
2502 ----------------------------------------------------------------------
2503 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2504 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2505 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2506 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2507 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2508 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2509 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2510 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2511 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2512 ----------------------------------------------------------------------
2513 (*) These are not used by any known coding system.
2514
2515 Control characters for these functions are defined by macros
2516 ISO_CODE_XXX in `coding.h'.
2517
2518 Designations are done by the following escape sequences:
2519 ----------------------------------------------------------------------
2520 escape sequence description
2521 ----------------------------------------------------------------------
2522 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2523 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2524 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2525 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2526 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2527 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2528 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2529 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2530 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2531 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2532 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2533 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2534 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2535 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2536 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2537 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2538 ----------------------------------------------------------------------
2539
2540 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2541 of dimension 1, chars 94, and final character <F>, etc...
2542
2543 Note (*): Although these designations are not allowed in ISO2022,
2544 Emacs accepts them on decoding, and produces them on encoding
2545 CHARS96 character sets in a coding system which is characterized as
2546 7-bit environment, non-locking-shift, and non-single-shift.
2547
2548 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2549 '(' must be omitted. We refer to this as "short-form" hereafter.
2550
2551 Now you may notice that there are a lot of ways of encoding the
2552 same multilingual text in ISO2022. Actually, there exist many
2553 coding systems such as Compound Text (used in X11's inter client
2554 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2555 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2556 localized platforms), and all of these are variants of ISO2022.
2557
2558 In addition to the above, Emacs handles two more kinds of escape
2559 sequences: ISO6429's direction specification and Emacs' private
2560 sequence for specifying character composition.
2561
2562 ISO6429's direction specification takes the following form:
2563 o CSI ']' -- end of the current direction
2564 o CSI '0' ']' -- end of the current direction
2565 o CSI '1' ']' -- start of left-to-right text
2566 o CSI '2' ']' -- start of right-to-left text
2567 The control character CSI (0x9B: control sequence introducer) is
2568 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2569
2570 Character composition specification takes the following form:
2571 o ESC '0' -- start relative composition
2572 o ESC '1' -- end composition
2573 o ESC '2' -- start rule-base composition (*)
2574 o ESC '3' -- start relative composition with alternate chars (**)
2575 o ESC '4' -- start rule-base composition with alternate chars (**)
2576 Since these are not standard escape sequences of any ISO standard,
2577 the use of them with these meanings is restricted to Emacs only.
2578
2579 (*) This form is used only in Emacs 20.7 and older versions,
2580 but newer versions can safely decode it.
2581 (**) This form is used only in Emacs 21.1 and newer versions,
2582 and older versions can't decode it.
2583
2584 Here's a list of example usages of these composition escape
2585 sequences (categorized by `enum composition_method').
2586
2587 COMPOSITION_RELATIVE:
2588 ESC 0 CHAR [ CHAR ] ESC 1
2589 COMPOSITION_WITH_RULE:
2590 ESC 2 CHAR [ RULE CHAR ] ESC 1
2591 COMPOSITION_WITH_ALTCHARS:
2592 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2593 COMPOSITION_WITH_RULE_ALTCHARS:
2594 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2595
2596 enum iso_code_class_type iso_code_class[256];
2597
2598 #define SAFE_CHARSET_P(coding, id) \
2599 ((id) <= (coding)->max_charset_id \
2600 && (coding)->safe_charsets[id] >= 0)
2601
2602
2603 #define SHIFT_OUT_OK(category) \
2604 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2605
2606 static void
2607 setup_iso_safe_charsets (attrs)
2608 Lisp_Object attrs;
2609 {
2610 Lisp_Object charset_list, safe_charsets;
2611 Lisp_Object request;
2612 Lisp_Object reg_usage;
2613 Lisp_Object tail;
2614 int reg94, reg96;
2615 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2616 int max_charset_id;
2617
2618 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2619 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2620 && ! EQ (charset_list, Viso_2022_charset_list))
2621 {
2622 CODING_ATTR_CHARSET_LIST (attrs)
2623 = charset_list = Viso_2022_charset_list;
2624 ASET (attrs, coding_attr_safe_charsets, Qnil);
2625 }
2626
2627 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2628 return;
2629
2630 max_charset_id = 0;
2631 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2632 {
2633 int id = XINT (XCAR (tail));
2634 if (max_charset_id < id)
2635 max_charset_id = id;
2636 }
2637
2638 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2639 make_number (255));
2640 request = AREF (attrs, coding_attr_iso_request);
2641 reg_usage = AREF (attrs, coding_attr_iso_usage);
2642 reg94 = XINT (XCAR (reg_usage));
2643 reg96 = XINT (XCDR (reg_usage));
2644
2645 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2646 {
2647 Lisp_Object id;
2648 Lisp_Object reg;
2649 struct charset *charset;
2650
2651 id = XCAR (tail);
2652 charset = CHARSET_FROM_ID (XINT (id));
2653 reg = Fcdr (Fassq (id, request));
2654 if (! NILP (reg))
2655 SSET (safe_charsets, XINT (id), XINT (reg));
2656 else if (charset->iso_chars_96)
2657 {
2658 if (reg96 < 4)
2659 SSET (safe_charsets, XINT (id), reg96);
2660 }
2661 else
2662 {
2663 if (reg94 < 4)
2664 SSET (safe_charsets, XINT (id), reg94);
2665 }
2666 }
2667 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2668 }
2669
2670
2671 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2672 Check if a text is encoded in one of ISO-2022 based codig systems.
2673 If it is, return 1, else return 0. */
2674
2675 static int
2676 detect_coding_iso_2022 (coding, detect_info)
2677 struct coding_system *coding;
2678 struct coding_detection_info *detect_info;
2679 {
2680 const unsigned char *src = coding->source, *src_base = src;
2681 const unsigned char *src_end = coding->source + coding->src_bytes;
2682 int multibytep = coding->src_multibyte;
2683 int single_shifting = 0;
2684 int id;
2685 int c, c1;
2686 int consumed_chars = 0;
2687 int i;
2688 int rejected = 0;
2689 int found = 0;
2690
2691 detect_info->checked |= CATEGORY_MASK_ISO;
2692
2693 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2694 {
2695 struct coding_system *this = &(coding_categories[i]);
2696 Lisp_Object attrs, val;
2697
2698 attrs = CODING_ID_ATTRS (this->id);
2699 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2700 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2701 setup_iso_safe_charsets (attrs);
2702 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2703 this->max_charset_id = SCHARS (val) - 1;
2704 this->safe_charsets = (char *) SDATA (val);
2705 }
2706
2707 /* A coding system of this category is always ASCII compatible. */
2708 src += coding->head_ascii;
2709
2710 while (rejected != CATEGORY_MASK_ISO)
2711 {
2712 src_base = src;
2713 ONE_MORE_BYTE (c);
2714 switch (c)
2715 {
2716 case ISO_CODE_ESC:
2717 if (inhibit_iso_escape_detection)
2718 break;
2719 single_shifting = 0;
2720 ONE_MORE_BYTE (c);
2721 if (c >= '(' && c <= '/')
2722 {
2723 /* Designation sequence for a charset of dimension 1. */
2724 ONE_MORE_BYTE (c1);
2725 if (c1 < ' ' || c1 >= 0x80
2726 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2727 /* Invalid designation sequence. Just ignore. */
2728 break;
2729 }
2730 else if (c == '$')
2731 {
2732 /* Designation sequence for a charset of dimension 2. */
2733 ONE_MORE_BYTE (c);
2734 if (c >= '@' && c <= 'B')
2735 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2736 id = iso_charset_table[1][0][c];
2737 else if (c >= '(' && c <= '/')
2738 {
2739 ONE_MORE_BYTE (c1);
2740 if (c1 < ' ' || c1 >= 0x80
2741 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2742 /* Invalid designation sequence. Just ignore. */
2743 break;
2744 }
2745 else
2746 /* Invalid designation sequence. Just ignore it. */
2747 break;
2748 }
2749 else if (c == 'N' || c == 'O')
2750 {
2751 /* ESC <Fe> for SS2 or SS3. */
2752 single_shifting = 1;
2753 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2754 break;
2755 }
2756 else if (c >= '0' && c <= '4')
2757 {
2758 /* ESC <Fp> for start/end composition. */
2759 found |= CATEGORY_MASK_ISO;
2760 break;
2761 }
2762 else
2763 {
2764 /* Invalid escape sequence. Just ignore it. */
2765 break;
2766 }
2767
2768 /* We found a valid designation sequence for CHARSET. */
2769 rejected |= CATEGORY_MASK_ISO_8BIT;
2770 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2771 id))
2772 found |= CATEGORY_MASK_ISO_7;
2773 else
2774 rejected |= CATEGORY_MASK_ISO_7;
2775 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2776 id))
2777 found |= CATEGORY_MASK_ISO_7_TIGHT;
2778 else
2779 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2780 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2781 id))
2782 found |= CATEGORY_MASK_ISO_7_ELSE;
2783 else
2784 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2785 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2786 id))
2787 found |= CATEGORY_MASK_ISO_8_ELSE;
2788 else
2789 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2790 break;
2791
2792 case ISO_CODE_SO:
2793 case ISO_CODE_SI:
2794 /* Locking shift out/in. */
2795 if (inhibit_iso_escape_detection)
2796 break;
2797 single_shifting = 0;
2798 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2799 break;
2800
2801 case ISO_CODE_CSI:
2802 /* Control sequence introducer. */
2803 single_shifting = 0;
2804 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2805 found |= CATEGORY_MASK_ISO_8_ELSE;
2806 goto check_extra_latin;
2807
2808 case ISO_CODE_SS2:
2809 case ISO_CODE_SS3:
2810 /* Single shift. */
2811 if (inhibit_iso_escape_detection)
2812 break;
2813 single_shifting = 0;
2814 rejected |= CATEGORY_MASK_ISO_7BIT;
2815 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2816 & CODING_ISO_FLAG_SINGLE_SHIFT)
2817 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2818 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2819 & CODING_ISO_FLAG_SINGLE_SHIFT)
2820 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2821 if (single_shifting)
2822 break;
2823 goto check_extra_latin;
2824
2825 default:
2826 if (c < 0)
2827 continue;
2828 if (c < 0x80)
2829 {
2830 single_shifting = 0;
2831 break;
2832 }
2833 if (c >= 0xA0)
2834 {
2835 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2836 found |= CATEGORY_MASK_ISO_8_1;
2837 /* Check the length of succeeding codes of the range
2838 0xA0..0FF. If the byte length is even, we include
2839 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2840 only when we are not single shifting. */
2841 if (! single_shifting
2842 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2843 {
2844 int i = 1;
2845 while (src < src_end)
2846 {
2847 ONE_MORE_BYTE (c);
2848 if (c < 0xA0)
2849 break;
2850 i++;
2851 }
2852
2853 if (i & 1 && src < src_end)
2854 rejected |= CATEGORY_MASK_ISO_8_2;
2855 else
2856 found |= CATEGORY_MASK_ISO_8_2;
2857 }
2858 break;
2859 }
2860 check_extra_latin:
2861 single_shifting = 0;
2862 if (! VECTORP (Vlatin_extra_code_table)
2863 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2864 {
2865 rejected = CATEGORY_MASK_ISO;
2866 break;
2867 }
2868 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2869 & CODING_ISO_FLAG_LATIN_EXTRA)
2870 found |= CATEGORY_MASK_ISO_8_1;
2871 else
2872 rejected |= CATEGORY_MASK_ISO_8_1;
2873 rejected |= CATEGORY_MASK_ISO_8_2;
2874 }
2875 }
2876 detect_info->rejected |= CATEGORY_MASK_ISO;
2877 return 0;
2878
2879 no_more_source:
2880 detect_info->rejected |= rejected;
2881 detect_info->found |= (found & ~rejected);
2882 return 1;
2883 }
2884
2885
2886 /* Set designation state into CODING. Set CHARS_96 to -1 if the
2887 escape sequence should be kept. */
2888 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2889 do { \
2890 int id, prev; \
2891 \
2892 if (final < '0' || final >= 128 \
2893 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2894 || !SAFE_CHARSET_P (coding, id)) \
2895 { \
2896 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2897 chars_96 = -1; \
2898 break; \
2899 } \
2900 prev = CODING_ISO_DESIGNATION (coding, reg); \
2901 if (id == charset_jisx0201_roman) \
2902 { \
2903 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2904 id = charset_ascii; \
2905 } \
2906 else if (id == charset_jisx0208_1978) \
2907 { \
2908 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2909 id = charset_jisx0208; \
2910 } \
2911 CODING_ISO_DESIGNATION (coding, reg) = id; \
2912 /* If there was an invalid designation to REG previously, and this \
2913 designation is ASCII to REG, we should keep this designation \
2914 sequence. */ \
2915 if (prev == -2 && id == charset_ascii) \
2916 chars_96 = -1; \
2917 } while (0)
2918
2919
2920 #define MAYBE_FINISH_COMPOSITION() \
2921 do { \
2922 int i; \
2923 if (composition_state == COMPOSING_NO) \
2924 break; \
2925 /* It is assured that we have enough room for producing \
2926 characters stored in the table `components'. */ \
2927 if (charbuf + component_idx > charbuf_end) \
2928 goto no_more_source; \
2929 composition_state = COMPOSING_NO; \
2930 if (method == COMPOSITION_RELATIVE \
2931 || method == COMPOSITION_WITH_ALTCHARS) \
2932 { \
2933 for (i = 0; i < component_idx; i++) \
2934 *charbuf++ = components[i]; \
2935 char_offset += component_idx; \
2936 } \
2937 else \
2938 { \
2939 for (i = 0; i < component_idx; i += 2) \
2940 *charbuf++ = components[i]; \
2941 char_offset += (component_idx / 2) + 1; \
2942 } \
2943 } while (0)
2944
2945
2946 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2947 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2948 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2949 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2950 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2951 */
2952
2953 #define DECODE_COMPOSITION_START(c1) \
2954 do { \
2955 if (c1 == '0' \
2956 && composition_state == COMPOSING_COMPONENT_RULE) \
2957 { \
2958 component_len = component_idx; \
2959 composition_state = COMPOSING_CHAR; \
2960 } \
2961 else \
2962 { \
2963 const unsigned char *p; \
2964 \
2965 MAYBE_FINISH_COMPOSITION (); \
2966 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2967 goto no_more_source; \
2968 for (p = src; p < src_end - 1; p++) \
2969 if (*p == ISO_CODE_ESC && p[1] == '1') \
2970 break; \
2971 if (p == src_end - 1) \
2972 { \
2973 /* The current composition doesn't end in the current \
2974 source. */ \
2975 record_conversion_result \
2976 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
2977 goto no_more_source; \
2978 } \
2979 \
2980 /* This is surely the start of a composition. */ \
2981 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2982 : c1 == '2' ? COMPOSITION_WITH_RULE \
2983 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2984 : COMPOSITION_WITH_RULE_ALTCHARS); \
2985 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2986 : COMPOSING_COMPONENT_CHAR); \
2987 component_idx = component_len = 0; \
2988 } \
2989 } while (0)
2990
2991
2992 /* Handle compositoin end sequence ESC 1. */
2993
2994 #define DECODE_COMPOSITION_END() \
2995 do { \
2996 int nchars = (component_len > 0 ? component_idx - component_len \
2997 : method == COMPOSITION_RELATIVE ? component_idx \
2998 : (component_idx + 1) / 2); \
2999 int i; \
3000 int *saved_charbuf = charbuf; \
3001 \
3002 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
3003 if (method != COMPOSITION_RELATIVE) \
3004 { \
3005 if (component_len == 0) \
3006 for (i = 0; i < component_idx; i++) \
3007 *charbuf++ = components[i]; \
3008 else \
3009 for (i = 0; i < component_len; i++) \
3010 *charbuf++ = components[i]; \
3011 *saved_charbuf = saved_charbuf - charbuf; \
3012 } \
3013 if (method == COMPOSITION_WITH_RULE) \
3014 for (i = 0; i < component_idx; i += 2, char_offset++) \
3015 *charbuf++ = components[i]; \
3016 else \
3017 for (i = component_len; i < component_idx; i++, char_offset++) \
3018 *charbuf++ = components[i]; \
3019 coding->annotated = 1; \
3020 composition_state = COMPOSING_NO; \
3021 } while (0)
3022
3023
3024 /* Decode a composition rule from the byte C1 (and maybe one more byte
3025 from SRC) and store one encoded composition rule in
3026 coding->cmp_data. */
3027
3028 #define DECODE_COMPOSITION_RULE(c1) \
3029 do { \
3030 (c1) -= 32; \
3031 if (c1 < 81) /* old format (before ver.21) */ \
3032 { \
3033 int gref = (c1) / 9; \
3034 int nref = (c1) % 9; \
3035 if (gref == 4) gref = 10; \
3036 if (nref == 4) nref = 10; \
3037 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
3038 } \
3039 else if (c1 < 93) /* new format (after ver.21) */ \
3040 { \
3041 ONE_MORE_BYTE (c2); \
3042 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
3043 } \
3044 else \
3045 c1 = 0; \
3046 } while (0)
3047
3048
3049 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3050
3051 static void
3052 decode_coding_iso_2022 (coding)
3053 struct coding_system *coding;
3054 {
3055 const unsigned char *src = coding->source + coding->consumed;
3056 const unsigned char *src_end = coding->source + coding->src_bytes;
3057 const unsigned char *src_base;
3058 int *charbuf = coding->charbuf + coding->charbuf_used;
3059 int *charbuf_end
3060 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
3061 int consumed_chars = 0, consumed_chars_base;
3062 int multibytep = coding->src_multibyte;
3063 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3064 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3065 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3066 int charset_id_2, charset_id_3;
3067 struct charset *charset;
3068 int c;
3069 /* For handling composition sequence. */
3070 #define COMPOSING_NO 0
3071 #define COMPOSING_CHAR 1
3072 #define COMPOSING_RULE 2
3073 #define COMPOSING_COMPONENT_CHAR 3
3074 #define COMPOSING_COMPONENT_RULE 4
3075
3076 int composition_state = COMPOSING_NO;
3077 enum composition_method method;
3078 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3079 int component_idx;
3080 int component_len;
3081 Lisp_Object attrs, charset_list;
3082 int char_offset = coding->produced_char;
3083 int last_offset = char_offset;
3084 int last_id = charset_ascii;
3085 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3086 int byte_after_cr = -1;
3087
3088 CODING_GET_INFO (coding, attrs, charset_list);
3089 setup_iso_safe_charsets (attrs);
3090 /* Charset list may have been changed. */
3091 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3092 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3093
3094 while (1)
3095 {
3096 int c1, c2;
3097
3098 src_base = src;
3099 consumed_chars_base = consumed_chars;
3100
3101 if (charbuf >= charbuf_end)
3102 break;
3103
3104 if (byte_after_cr >= 0)
3105 c1 = byte_after_cr, byte_after_cr = -1;
3106 else
3107 ONE_MORE_BYTE (c1);
3108 if (c1 < 0)
3109 goto invalid_code;
3110
3111 /* We produce at most one character. */
3112 switch (iso_code_class [c1])
3113 {
3114 case ISO_0x20_or_0x7F:
3115 if (composition_state != COMPOSING_NO)
3116 {
3117 if (composition_state == COMPOSING_RULE
3118 || composition_state == COMPOSING_COMPONENT_RULE)
3119 {
3120 DECODE_COMPOSITION_RULE (c1);
3121 components[component_idx++] = c1;
3122 composition_state--;
3123 continue;
3124 }
3125 }
3126 if (charset_id_0 < 0
3127 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3128 /* This is SPACE or DEL. */
3129 charset = CHARSET_FROM_ID (charset_ascii);
3130 else
3131 charset = CHARSET_FROM_ID (charset_id_0);
3132 break;
3133
3134 case ISO_graphic_plane_0:
3135 if (composition_state != COMPOSING_NO)
3136 {
3137 if (composition_state == COMPOSING_RULE
3138 || composition_state == COMPOSING_COMPONENT_RULE)
3139 {
3140 DECODE_COMPOSITION_RULE (c1);
3141 components[component_idx++] = c1;
3142 composition_state--;
3143 continue;
3144 }
3145 }
3146 if (charset_id_0 < 0)
3147 charset = CHARSET_FROM_ID (charset_ascii);
3148 else
3149 charset = CHARSET_FROM_ID (charset_id_0);
3150 break;
3151
3152 case ISO_0xA0_or_0xFF:
3153 if (charset_id_1 < 0
3154 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3155 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3156 goto invalid_code;
3157 /* This is a graphic character, we fall down ... */
3158
3159 case ISO_graphic_plane_1:
3160 if (charset_id_1 < 0)
3161 goto invalid_code;
3162 charset = CHARSET_FROM_ID (charset_id_1);
3163 break;
3164
3165 case ISO_control_0:
3166 if (eol_crlf && c1 == '\r')
3167 ONE_MORE_BYTE (byte_after_cr);
3168 MAYBE_FINISH_COMPOSITION ();
3169 charset = CHARSET_FROM_ID (charset_ascii);
3170 break;
3171
3172 case ISO_control_1:
3173 MAYBE_FINISH_COMPOSITION ();
3174 goto invalid_code;
3175
3176 case ISO_shift_out:
3177 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3178 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3179 goto invalid_code;
3180 CODING_ISO_INVOCATION (coding, 0) = 1;
3181 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3182 continue;
3183
3184 case ISO_shift_in:
3185 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3186 goto invalid_code;
3187 CODING_ISO_INVOCATION (coding, 0) = 0;
3188 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3189 continue;
3190
3191 case ISO_single_shift_2_7:
3192 case ISO_single_shift_2:
3193 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3194 goto invalid_code;
3195 /* SS2 is handled as an escape sequence of ESC 'N' */
3196 c1 = 'N';
3197 goto label_escape_sequence;
3198
3199 case ISO_single_shift_3:
3200 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3201 goto invalid_code;
3202 /* SS2 is handled as an escape sequence of ESC 'O' */
3203 c1 = 'O';
3204 goto label_escape_sequence;
3205
3206 case ISO_control_sequence_introducer:
3207 /* CSI is handled as an escape sequence of ESC '[' ... */
3208 c1 = '[';
3209 goto label_escape_sequence;
3210
3211 case ISO_escape:
3212 ONE_MORE_BYTE (c1);
3213 label_escape_sequence:
3214 /* Escape sequences handled here are invocation,
3215 designation, direction specification, and character
3216 composition specification. */
3217 switch (c1)
3218 {
3219 case '&': /* revision of following character set */
3220 ONE_MORE_BYTE (c1);
3221 if (!(c1 >= '@' && c1 <= '~'))
3222 goto invalid_code;
3223 ONE_MORE_BYTE (c1);
3224 if (c1 != ISO_CODE_ESC)
3225 goto invalid_code;
3226 ONE_MORE_BYTE (c1);
3227 goto label_escape_sequence;
3228
3229 case '$': /* designation of 2-byte character set */
3230 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3231 goto invalid_code;
3232 {
3233 int reg, chars96;
3234
3235 ONE_MORE_BYTE (c1);
3236 if (c1 >= '@' && c1 <= 'B')
3237 { /* designation of JISX0208.1978, GB2312.1980,
3238 or JISX0208.1980 */
3239 reg = 0, chars96 = 0;
3240 }
3241 else if (c1 >= 0x28 && c1 <= 0x2B)
3242 { /* designation of DIMENSION2_CHARS94 character set */
3243 reg = c1 - 0x28, chars96 = 0;
3244 ONE_MORE_BYTE (c1);
3245 }
3246 else if (c1 >= 0x2C && c1 <= 0x2F)
3247 { /* designation of DIMENSION2_CHARS96 character set */
3248 reg = c1 - 0x2C, chars96 = 1;
3249 ONE_MORE_BYTE (c1);
3250 }
3251 else
3252 goto invalid_code;
3253 DECODE_DESIGNATION (reg, 2, chars96, c1);
3254 /* We must update these variables now. */
3255 if (reg == 0)
3256 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3257 else if (reg == 1)
3258 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3259 if (chars96 < 0)
3260 goto invalid_code;
3261 }
3262 continue;
3263
3264 case 'n': /* invocation of locking-shift-2 */
3265 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3266 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3267 goto invalid_code;
3268 CODING_ISO_INVOCATION (coding, 0) = 2;
3269 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3270 continue;
3271
3272 case 'o': /* invocation of locking-shift-3 */
3273 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3274 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3275 goto invalid_code;
3276 CODING_ISO_INVOCATION (coding, 0) = 3;
3277 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3278 continue;
3279
3280 case 'N': /* invocation of single-shift-2 */
3281 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3282 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3283 goto invalid_code;
3284 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3285 if (charset_id_2 < 0)
3286 charset = CHARSET_FROM_ID (charset_ascii);
3287 else
3288 charset = CHARSET_FROM_ID (charset_id_2);
3289 ONE_MORE_BYTE (c1);
3290 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3291 goto invalid_code;
3292 break;
3293
3294 case 'O': /* invocation of single-shift-3 */
3295 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3296 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3297 goto invalid_code;
3298 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3299 if (charset_id_3 < 0)
3300 charset = CHARSET_FROM_ID (charset_ascii);
3301 else
3302 charset = CHARSET_FROM_ID (charset_id_3);
3303 ONE_MORE_BYTE (c1);
3304 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3305 goto invalid_code;
3306 break;
3307
3308 case '0': case '2': case '3': case '4': /* start composition */
3309 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3310 goto invalid_code;
3311 DECODE_COMPOSITION_START (c1);
3312 continue;
3313
3314 case '1': /* end composition */
3315 if (composition_state == COMPOSING_NO)
3316 goto invalid_code;
3317 DECODE_COMPOSITION_END ();
3318 continue;
3319
3320 case '[': /* specification of direction */
3321 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3322 goto invalid_code;
3323 /* For the moment, nested direction is not supported.
3324 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3325 left-to-right, and nozero means right-to-left. */
3326 ONE_MORE_BYTE (c1);
3327 switch (c1)
3328 {
3329 case ']': /* end of the current direction */
3330 coding->mode &= ~CODING_MODE_DIRECTION;
3331
3332 case '0': /* end of the current direction */
3333 case '1': /* start of left-to-right direction */
3334 ONE_MORE_BYTE (c1);
3335 if (c1 == ']')
3336 coding->mode &= ~CODING_MODE_DIRECTION;
3337 else
3338 goto invalid_code;
3339 break;
3340
3341 case '2': /* start of right-to-left direction */
3342 ONE_MORE_BYTE (c1);
3343 if (c1 == ']')
3344 coding->mode |= CODING_MODE_DIRECTION;
3345 else
3346 goto invalid_code;
3347 break;
3348
3349 default:
3350 goto invalid_code;
3351 }
3352 continue;
3353
3354 case '%':
3355 ONE_MORE_BYTE (c1);
3356 if (c1 == '/')
3357 {
3358 /* CTEXT extended segment:
3359 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3360 We keep these bytes as is for the moment.
3361 They may be decoded by post-read-conversion. */
3362 int dim, M, L;
3363 int size;
3364
3365 ONE_MORE_BYTE (dim);
3366 ONE_MORE_BYTE (M);
3367 ONE_MORE_BYTE (L);
3368 size = ((M - 128) * 128) + (L - 128);
3369 if (charbuf + 8 + size > charbuf_end)
3370 goto break_loop;
3371 *charbuf++ = ISO_CODE_ESC;
3372 *charbuf++ = '%';
3373 *charbuf++ = '/';
3374 *charbuf++ = dim;
3375 *charbuf++ = BYTE8_TO_CHAR (M);
3376 *charbuf++ = BYTE8_TO_CHAR (L);
3377 while (size-- > 0)
3378 {
3379 ONE_MORE_BYTE (c1);
3380 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3381 }
3382 }
3383 else if (c1 == 'G')
3384 {
3385 /* XFree86 extension for embedding UTF-8 in CTEXT:
3386 ESC % G --UTF-8-BYTES-- ESC % @
3387 We keep these bytes as is for the moment.
3388 They may be decoded by post-read-conversion. */
3389 int *p = charbuf;
3390
3391 if (p + 6 > charbuf_end)
3392 goto break_loop;
3393 *p++ = ISO_CODE_ESC;
3394 *p++ = '%';
3395 *p++ = 'G';
3396 while (p < charbuf_end)
3397 {
3398 ONE_MORE_BYTE (c1);
3399 if (c1 == ISO_CODE_ESC
3400 && src + 1 < src_end
3401 && src[0] == '%'
3402 && src[1] == '@')
3403 {
3404 src += 2;
3405 break;
3406 }
3407 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3408 }
3409 if (p + 3 > charbuf_end)
3410 goto break_loop;
3411 *p++ = ISO_CODE_ESC;
3412 *p++ = '%';
3413 *p++ = '@';
3414 charbuf = p;
3415 }
3416 else
3417 goto invalid_code;
3418 continue;
3419 break;
3420
3421 default:
3422 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3423 goto invalid_code;
3424 {
3425 int reg, chars96;
3426
3427 if (c1 >= 0x28 && c1 <= 0x2B)
3428 { /* designation of DIMENSION1_CHARS94 character set */
3429 reg = c1 - 0x28, chars96 = 0;
3430 ONE_MORE_BYTE (c1);
3431 }
3432 else if (c1 >= 0x2C && c1 <= 0x2F)
3433 { /* designation of DIMENSION1_CHARS96 character set */
3434 reg = c1 - 0x2C, chars96 = 1;
3435 ONE_MORE_BYTE (c1);
3436 }
3437 else
3438 goto invalid_code;
3439 DECODE_DESIGNATION (reg, 1, chars96, c1);
3440 /* We must update these variables now. */
3441 if (reg == 0)
3442 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3443 else if (reg == 1)
3444 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3445 if (chars96 < 0)
3446 goto invalid_code;
3447 }
3448 continue;
3449 }
3450 }
3451
3452 if (charset->id != charset_ascii
3453 && last_id != charset->id)
3454 {
3455 if (last_id != charset_ascii)
3456 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3457 last_id = charset->id;
3458 last_offset = char_offset;
3459 }
3460
3461 /* Now we know CHARSET and 1st position code C1 of a character.
3462 Produce a decoded character while getting 2nd position code
3463 C2 if necessary. */
3464 c1 &= 0x7F;
3465 if (CHARSET_DIMENSION (charset) > 1)
3466 {
3467 ONE_MORE_BYTE (c2);
3468 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3469 /* C2 is not in a valid range. */
3470 goto invalid_code;
3471 c1 = (c1 << 8) | (c2 & 0x7F);
3472 if (CHARSET_DIMENSION (charset) > 2)
3473 {
3474 ONE_MORE_BYTE (c2);
3475 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3476 /* C2 is not in a valid range. */
3477 goto invalid_code;
3478 c1 = (c1 << 8) | (c2 & 0x7F);
3479 }
3480 }
3481
3482 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3483 if (c < 0)
3484 {
3485 MAYBE_FINISH_COMPOSITION ();
3486 for (; src_base < src; src_base++, char_offset++)
3487 {
3488 if (ASCII_BYTE_P (*src_base))
3489 *charbuf++ = *src_base;
3490 else
3491 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3492 }
3493 }
3494 else if (composition_state == COMPOSING_NO)
3495 {
3496 *charbuf++ = c;
3497 char_offset++;
3498 }
3499 else
3500 {
3501 components[component_idx++] = c;
3502 if (method == COMPOSITION_WITH_RULE
3503 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3504 && composition_state == COMPOSING_COMPONENT_CHAR))
3505 composition_state++;
3506 }
3507 continue;
3508
3509 invalid_code:
3510 MAYBE_FINISH_COMPOSITION ();
3511 src = src_base;
3512 consumed_chars = consumed_chars_base;
3513 ONE_MORE_BYTE (c);
3514 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3515 char_offset++;
3516 coding->errors++;
3517 continue;
3518
3519 break_loop:
3520 break;
3521 }
3522
3523 no_more_source:
3524 if (last_id != charset_ascii)
3525 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3526 coding->consumed_char += consumed_chars_base;
3527 coding->consumed = src_base - coding->source;
3528 coding->charbuf_used = charbuf - coding->charbuf;
3529 }
3530
3531
3532 /* ISO2022 encoding stuff. */
3533
3534 /*
3535 It is not enough to say just "ISO2022" on encoding, we have to
3536 specify more details. In Emacs, each coding system of ISO2022
3537 variant has the following specifications:
3538 1. Initial designation to G0 thru G3.
3539 2. Allows short-form designation?
3540 3. ASCII should be designated to G0 before control characters?
3541 4. ASCII should be designated to G0 at end of line?
3542 5. 7-bit environment or 8-bit environment?
3543 6. Use locking-shift?
3544 7. Use Single-shift?
3545 And the following two are only for Japanese:
3546 8. Use ASCII in place of JIS0201-1976-Roman?
3547 9. Use JISX0208-1983 in place of JISX0208-1978?
3548 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3549 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3550 details.
3551 */
3552
3553 /* Produce codes (escape sequence) for designating CHARSET to graphic
3554 register REG at DST, and increment DST. If <final-char> of CHARSET is
3555 '@', 'A', or 'B' and the coding system CODING allows, produce
3556 designation sequence of short-form. */
3557
3558 #define ENCODE_DESIGNATION(charset, reg, coding) \
3559 do { \
3560 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3561 char *intermediate_char_94 = "()*+"; \
3562 char *intermediate_char_96 = ",-./"; \
3563 int revision = -1; \
3564 int c; \
3565 \
3566 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3567 revision = CHARSET_ISO_REVISION (charset); \
3568 \
3569 if (revision >= 0) \
3570 { \
3571 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3572 EMIT_ONE_BYTE ('@' + revision); \
3573 } \
3574 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3575 if (CHARSET_DIMENSION (charset) == 1) \
3576 { \
3577 if (! CHARSET_ISO_CHARS_96 (charset)) \
3578 c = intermediate_char_94[reg]; \
3579 else \
3580 c = intermediate_char_96[reg]; \
3581 EMIT_ONE_ASCII_BYTE (c); \
3582 } \
3583 else \
3584 { \
3585 EMIT_ONE_ASCII_BYTE ('$'); \
3586 if (! CHARSET_ISO_CHARS_96 (charset)) \
3587 { \
3588 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3589 || reg != 0 \
3590 || final_char < '@' || final_char > 'B') \
3591 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3592 } \
3593 else \
3594 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3595 } \
3596 EMIT_ONE_ASCII_BYTE (final_char); \
3597 \
3598 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3599 } while (0)
3600
3601
3602 /* The following two macros produce codes (control character or escape
3603 sequence) for ISO2022 single-shift functions (single-shift-2 and
3604 single-shift-3). */
3605
3606 #define ENCODE_SINGLE_SHIFT_2 \
3607 do { \
3608 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3609 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3610 else \
3611 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3612 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3613 } while (0)
3614
3615
3616 #define ENCODE_SINGLE_SHIFT_3 \
3617 do { \
3618 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3619 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3620 else \
3621 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3622 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3623 } while (0)
3624
3625
3626 /* The following four macros produce codes (control character or
3627 escape sequence) for ISO2022 locking-shift functions (shift-in,
3628 shift-out, locking-shift-2, and locking-shift-3). */
3629
3630 #define ENCODE_SHIFT_IN \
3631 do { \
3632 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3633 CODING_ISO_INVOCATION (coding, 0) = 0; \
3634 } while (0)
3635
3636
3637 #define ENCODE_SHIFT_OUT \
3638 do { \
3639 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3640 CODING_ISO_INVOCATION (coding, 0) = 1; \
3641 } while (0)
3642
3643
3644 #define ENCODE_LOCKING_SHIFT_2 \
3645 do { \
3646 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3647 CODING_ISO_INVOCATION (coding, 0) = 2; \
3648 } while (0)
3649
3650
3651 #define ENCODE_LOCKING_SHIFT_3 \
3652 do { \
3653 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3654 CODING_ISO_INVOCATION (coding, 0) = 3; \
3655 } while (0)
3656
3657
3658 /* Produce codes for a DIMENSION1 character whose character set is
3659 CHARSET and whose position-code is C1. Designation and invocation
3660 sequences are also produced in advance if necessary. */
3661
3662 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3663 do { \
3664 int id = CHARSET_ID (charset); \
3665 \
3666 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3667 && id == charset_ascii) \
3668 { \
3669 id = charset_jisx0201_roman; \
3670 charset = CHARSET_FROM_ID (id); \
3671 } \
3672 \
3673 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3674 { \
3675 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3676 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3677 else \
3678 EMIT_ONE_BYTE (c1 | 0x80); \
3679 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3680 break; \
3681 } \
3682 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3683 { \
3684 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3685 break; \
3686 } \
3687 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3688 { \
3689 EMIT_ONE_BYTE (c1 | 0x80); \
3690 break; \
3691 } \
3692 else \
3693 /* Since CHARSET is not yet invoked to any graphic planes, we \
3694 must invoke it, or, at first, designate it to some graphic \
3695 register. Then repeat the loop to actually produce the \
3696 character. */ \
3697 dst = encode_invocation_designation (charset, coding, dst, \
3698 &produced_chars); \
3699 } while (1)
3700
3701
3702 /* Produce codes for a DIMENSION2 character whose character set is
3703 CHARSET and whose position-codes are C1 and C2. Designation and
3704 invocation codes are also produced in advance if necessary. */
3705
3706 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3707 do { \
3708 int id = CHARSET_ID (charset); \
3709 \
3710 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3711 && id == charset_jisx0208) \
3712 { \
3713 id = charset_jisx0208_1978; \
3714 charset = CHARSET_FROM_ID (id); \
3715 } \
3716 \
3717 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3718 { \
3719 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3720 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3721 else \
3722 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3723 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3724 break; \
3725 } \
3726 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3727 { \
3728 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3729 break; \
3730 } \
3731 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3732 { \
3733 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3734 break; \
3735 } \
3736 else \
3737 /* Since CHARSET is not yet invoked to any graphic planes, we \
3738 must invoke it, or, at first, designate it to some graphic \
3739 register. Then repeat the loop to actually produce the \
3740 character. */ \
3741 dst = encode_invocation_designation (charset, coding, dst, \
3742 &produced_chars); \
3743 } while (1)
3744
3745
3746 #define ENCODE_ISO_CHARACTER(charset, c) \
3747 do { \
3748 int code = ENCODE_CHAR ((charset),(c)); \
3749 \
3750 if (CHARSET_DIMENSION (charset) == 1) \
3751 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3752 else \
3753 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3754 } while (0)
3755
3756
3757 /* Produce designation and invocation codes at a place pointed by DST
3758 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3759 Return new DST. */
3760
3761 unsigned char *
3762 encode_invocation_designation (charset, coding, dst, p_nchars)
3763 struct charset *charset;
3764 struct coding_system *coding;
3765 unsigned char *dst;
3766 int *p_nchars;
3767 {
3768 int multibytep = coding->dst_multibyte;
3769 int produced_chars = *p_nchars;
3770 int reg; /* graphic register number */
3771 int id = CHARSET_ID (charset);
3772
3773 /* At first, check designations. */
3774 for (reg = 0; reg < 4; reg++)
3775 if (id == CODING_ISO_DESIGNATION (coding, reg))
3776 break;
3777
3778 if (reg >= 4)
3779 {
3780 /* CHARSET is not yet designated to any graphic registers. */
3781 /* At first check the requested designation. */
3782 reg = CODING_ISO_REQUEST (coding, id);
3783 if (reg < 0)
3784 /* Since CHARSET requests no special designation, designate it
3785 to graphic register 0. */
3786 reg = 0;
3787
3788 ENCODE_DESIGNATION (charset, reg, coding);
3789 }
3790
3791 if (CODING_ISO_INVOCATION (coding, 0) != reg
3792 && CODING_ISO_INVOCATION (coding, 1) != reg)
3793 {
3794 /* Since the graphic register REG is not invoked to any graphic
3795 planes, invoke it to graphic plane 0. */
3796 switch (reg)
3797 {
3798 case 0: /* graphic register 0 */
3799 ENCODE_SHIFT_IN;
3800 break;
3801
3802 case 1: /* graphic register 1 */
3803 ENCODE_SHIFT_OUT;
3804 break;
3805
3806 case 2: /* graphic register 2 */
3807 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3808 ENCODE_SINGLE_SHIFT_2;
3809 else
3810 ENCODE_LOCKING_SHIFT_2;
3811 break;
3812
3813 case 3: /* graphic register 3 */
3814 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3815 ENCODE_SINGLE_SHIFT_3;
3816 else
3817 ENCODE_LOCKING_SHIFT_3;
3818 break;
3819 }
3820 }
3821
3822 *p_nchars = produced_chars;
3823 return dst;
3824 }
3825
3826 /* The following three macros produce codes for indicating direction
3827 of text. */
3828 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3829 do { \
3830 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3831 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3832 else \
3833 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3834 } while (0)
3835
3836
3837 #define ENCODE_DIRECTION_R2L() \
3838 do { \
3839 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3840 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3841 } while (0)
3842
3843
3844 #define ENCODE_DIRECTION_L2R() \
3845 do { \
3846 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3847 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3848 } while (0)
3849
3850
3851 /* Produce codes for designation and invocation to reset the graphic
3852 planes and registers to initial state. */
3853 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3854 do { \
3855 int reg; \
3856 struct charset *charset; \
3857 \
3858 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3859 ENCODE_SHIFT_IN; \
3860 for (reg = 0; reg < 4; reg++) \
3861 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3862 && (CODING_ISO_DESIGNATION (coding, reg) \
3863 != CODING_ISO_INITIAL (coding, reg))) \
3864 { \
3865 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3866 ENCODE_DESIGNATION (charset, reg, coding); \
3867 } \
3868 } while (0)
3869
3870
3871 /* Produce designation sequences of charsets in the line started from
3872 SRC to a place pointed by DST, and return updated DST.
3873
3874 If the current block ends before any end-of-line, we may fail to
3875 find all the necessary designations. */
3876
3877 static unsigned char *
3878 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3879 struct coding_system *coding;
3880 int *charbuf, *charbuf_end;
3881 unsigned char *dst;
3882 {
3883 struct charset *charset;
3884 /* Table of charsets to be designated to each graphic register. */
3885 int r[4];
3886 int c, found = 0, reg;
3887 int produced_chars = 0;
3888 int multibytep = coding->dst_multibyte;
3889 Lisp_Object attrs;
3890 Lisp_Object charset_list;
3891
3892 attrs = CODING_ID_ATTRS (coding->id);
3893 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3894 if (EQ (charset_list, Qiso_2022))
3895 charset_list = Viso_2022_charset_list;
3896
3897 for (reg = 0; reg < 4; reg++)
3898 r[reg] = -1;
3899
3900 while (found < 4)
3901 {
3902 int id;
3903
3904 c = *charbuf++;
3905 if (c == '\n')
3906 break;
3907 charset = char_charset (c, charset_list, NULL);
3908 id = CHARSET_ID (charset);
3909 reg = CODING_ISO_REQUEST (coding, id);
3910 if (reg >= 0 && r[reg] < 0)
3911 {
3912 found++;
3913 r[reg] = id;
3914 }
3915 }
3916
3917 if (found)
3918 {
3919 for (reg = 0; reg < 4; reg++)
3920 if (r[reg] >= 0
3921 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3922 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3923 }
3924
3925 return dst;
3926 }
3927
3928 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3929
3930 static int
3931 encode_coding_iso_2022 (coding)
3932 struct coding_system *coding;
3933 {
3934 int multibytep = coding->dst_multibyte;
3935 int *charbuf = coding->charbuf;
3936 int *charbuf_end = charbuf + coding->charbuf_used;
3937 unsigned char *dst = coding->destination + coding->produced;
3938 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3939 int safe_room = 16;
3940 int bol_designation
3941 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3942 && CODING_ISO_BOL (coding));
3943 int produced_chars = 0;
3944 Lisp_Object attrs, eol_type, charset_list;
3945 int ascii_compatible;
3946 int c;
3947 int preferred_charset_id = -1;
3948
3949 CODING_GET_INFO (coding, attrs, charset_list);
3950 eol_type = CODING_ID_EOL_TYPE (coding->id);
3951 if (VECTORP (eol_type))
3952 eol_type = Qunix;
3953
3954 setup_iso_safe_charsets (attrs);
3955 /* Charset list may have been changed. */
3956 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3957 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3958
3959 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3960
3961 while (charbuf < charbuf_end)
3962 {
3963 ASSURE_DESTINATION (safe_room);
3964
3965 if (bol_designation)
3966 {
3967 unsigned char *dst_prev = dst;
3968
3969 /* We have to produce designation sequences if any now. */
3970 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3971 bol_designation = 0;
3972 /* We are sure that designation sequences are all ASCII bytes. */
3973 produced_chars += dst - dst_prev;
3974 }
3975
3976 c = *charbuf++;
3977
3978 if (c < 0)
3979 {
3980 /* Handle an annotation. */
3981 switch (*charbuf)
3982 {
3983 case CODING_ANNOTATE_COMPOSITION_MASK:
3984 /* Not yet implemented. */
3985 break;
3986 case CODING_ANNOTATE_CHARSET_MASK:
3987 preferred_charset_id = charbuf[2];
3988 if (preferred_charset_id >= 0
3989 && NILP (Fmemq (make_number (preferred_charset_id),
3990 charset_list)))
3991 preferred_charset_id = -1;
3992 break;
3993 default:
3994 abort ();
3995 }
3996 charbuf += -c - 1;
3997 continue;
3998 }
3999
4000 /* Now encode the character C. */
4001 if (c < 0x20 || c == 0x7F)
4002 {
4003 if (c == '\n'
4004 || (c == '\r' && EQ (eol_type, Qmac)))
4005 {
4006 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4007 ENCODE_RESET_PLANE_AND_REGISTER ();
4008 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4009 {
4010 int i;
4011
4012 for (i = 0; i < 4; i++)
4013 CODING_ISO_DESIGNATION (coding, i)
4014 = CODING_ISO_INITIAL (coding, i);
4015 }
4016 bol_designation
4017 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4018 }
4019 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4020 ENCODE_RESET_PLANE_AND_REGISTER ();
4021 EMIT_ONE_ASCII_BYTE (c);
4022 }
4023 else if (ASCII_CHAR_P (c))
4024 {
4025 if (ascii_compatible)
4026 EMIT_ONE_ASCII_BYTE (c);
4027 else
4028 {
4029 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4030 ENCODE_ISO_CHARACTER (charset, c);
4031 }
4032 }
4033 else if (CHAR_BYTE8_P (c))
4034 {
4035 c = CHAR_TO_BYTE8 (c);
4036 EMIT_ONE_BYTE (c);
4037 }
4038 else
4039 {
4040 struct charset *charset;
4041
4042 if (preferred_charset_id >= 0)
4043 {
4044 charset = CHARSET_FROM_ID (preferred_charset_id);
4045 if (! CHAR_CHARSET_P (c, charset))
4046 charset = char_charset (c, charset_list, NULL);
4047 }
4048 else
4049 charset = char_charset (c, charset_list, NULL);
4050 if (!charset)
4051 {
4052 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4053 {
4054 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4055 charset = CHARSET_FROM_ID (charset_ascii);
4056 }
4057 else
4058 {
4059 c = coding->default_char;
4060 charset = char_charset (c, charset_list, NULL);
4061 }
4062 }
4063 ENCODE_ISO_CHARACTER (charset, c);
4064 }
4065 }
4066
4067 if (coding->mode & CODING_MODE_LAST_BLOCK
4068 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4069 {
4070 ASSURE_DESTINATION (safe_room);
4071 ENCODE_RESET_PLANE_AND_REGISTER ();
4072 }
4073 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4074 CODING_ISO_BOL (coding) = bol_designation;
4075 coding->produced_char += produced_chars;
4076 coding->produced = dst - coding->destination;
4077 return 0;
4078 }
4079
4080 \f
4081 /*** 8,9. SJIS and BIG5 handlers ***/
4082
4083 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4084 quite widely. So, for the moment, Emacs supports them in the bare
4085 C code. But, in the future, they may be supported only by CCL. */
4086
4087 /* SJIS is a coding system encoding three character sets: ASCII, right
4088 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4089 as is. A character of charset katakana-jisx0201 is encoded by
4090 "position-code + 0x80". A character of charset japanese-jisx0208
4091 is encoded in 2-byte but two position-codes are divided and shifted
4092 so that it fit in the range below.
4093
4094 --- CODE RANGE of SJIS ---
4095 (character set) (range)
4096 ASCII 0x00 .. 0x7F
4097 KATAKANA-JISX0201 0xA0 .. 0xDF
4098 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4099 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4100 -------------------------------
4101
4102 */
4103
4104 /* BIG5 is a coding system encoding two character sets: ASCII and
4105 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4106 character set and is encoded in two-byte.
4107
4108 --- CODE RANGE of BIG5 ---
4109 (character set) (range)
4110 ASCII 0x00 .. 0x7F
4111 Big5 (1st byte) 0xA1 .. 0xFE
4112 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4113 --------------------------
4114
4115 */
4116
4117 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4118 Check if a text is encoded in SJIS. If it is, return
4119 CATEGORY_MASK_SJIS, else return 0. */
4120
4121 static int
4122 detect_coding_sjis (coding, detect_info)
4123 struct coding_system *coding;
4124 struct coding_detection_info *detect_info;
4125 {
4126 const unsigned char *src = coding->source, *src_base;
4127 const unsigned char *src_end = coding->source + coding->src_bytes;
4128 int multibytep = coding->src_multibyte;
4129 int consumed_chars = 0;
4130 int found = 0;
4131 int c;
4132
4133 detect_info->checked |= CATEGORY_MASK_SJIS;
4134 /* A coding system of this category is always ASCII compatible. */
4135 src += coding->head_ascii;
4136
4137 while (1)
4138 {
4139 src_base = src;
4140 ONE_MORE_BYTE (c);
4141 if (c < 0x80)
4142 continue;
4143 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4144 {
4145 ONE_MORE_BYTE (c);
4146 if (c < 0x40 || c == 0x7F || c > 0xFC)
4147 break;
4148 found = CATEGORY_MASK_SJIS;
4149 }
4150 else if (c >= 0xA0 && c < 0xE0)
4151 found = CATEGORY_MASK_SJIS;
4152 else
4153 break;
4154 }
4155 detect_info->rejected |= CATEGORY_MASK_SJIS;
4156 return 0;
4157
4158 no_more_source:
4159 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4160 {
4161 detect_info->rejected |= CATEGORY_MASK_SJIS;
4162 return 0;
4163 }
4164 detect_info->found |= found;
4165 return 1;
4166 }
4167
4168 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4169 Check if a text is encoded in BIG5. If it is, return
4170 CATEGORY_MASK_BIG5, else return 0. */
4171
4172 static int
4173 detect_coding_big5 (coding, detect_info)
4174 struct coding_system *coding;
4175 struct coding_detection_info *detect_info;
4176 {
4177 const unsigned char *src = coding->source, *src_base;
4178 const unsigned char *src_end = coding->source + coding->src_bytes;
4179 int multibytep = coding->src_multibyte;
4180 int consumed_chars = 0;
4181 int found = 0;
4182 int c;
4183
4184 detect_info->checked |= CATEGORY_MASK_BIG5;
4185 /* A coding system of this category is always ASCII compatible. */
4186 src += coding->head_ascii;
4187
4188 while (1)
4189 {
4190 src_base = src;
4191 ONE_MORE_BYTE (c);
4192 if (c < 0x80)
4193 continue;
4194 if (c >= 0xA1)
4195 {
4196 ONE_MORE_BYTE (c);
4197 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4198 return 0;
4199 found = CATEGORY_MASK_BIG5;
4200 }
4201 else
4202 break;
4203 }
4204 detect_info->rejected |= CATEGORY_MASK_BIG5;
4205 return 0;
4206
4207 no_more_source:
4208 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4209 {
4210 detect_info->rejected |= CATEGORY_MASK_BIG5;
4211 return 0;
4212 }
4213 detect_info->found |= found;
4214 return 1;
4215 }
4216
4217 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4218 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4219
4220 static void
4221 decode_coding_sjis (coding)
4222 struct coding_system *coding;
4223 {
4224 const unsigned char *src = coding->source + coding->consumed;
4225 const unsigned char *src_end = coding->source + coding->src_bytes;
4226 const unsigned char *src_base;
4227 int *charbuf = coding->charbuf + coding->charbuf_used;
4228 int *charbuf_end
4229 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4230 int consumed_chars = 0, consumed_chars_base;
4231 int multibytep = coding->src_multibyte;
4232 struct charset *charset_roman, *charset_kanji, *charset_kana;
4233 struct charset *charset_kanji2;
4234 Lisp_Object attrs, charset_list, val;
4235 int char_offset = coding->produced_char;
4236 int last_offset = char_offset;
4237 int last_id = charset_ascii;
4238 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4239 int byte_after_cr = -1;
4240
4241 CODING_GET_INFO (coding, attrs, charset_list);
4242
4243 val = charset_list;
4244 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4245 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4246 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4247 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4248
4249 while (1)
4250 {
4251 int c, c1;
4252 struct charset *charset;
4253
4254 src_base = src;
4255 consumed_chars_base = consumed_chars;
4256
4257 if (charbuf >= charbuf_end)
4258 break;
4259
4260 if (byte_after_cr >= 0)
4261 c = byte_after_cr, byte_after_cr = -1;
4262 else
4263 ONE_MORE_BYTE (c);
4264 if (c < 0)
4265 goto invalid_code;
4266 if (c < 0x80)
4267 {
4268 if (eol_crlf && c == '\r')
4269 ONE_MORE_BYTE (byte_after_cr);
4270 charset = charset_roman;
4271 }
4272 else if (c == 0x80 || c == 0xA0)
4273 goto invalid_code;
4274 else if (c >= 0xA1 && c <= 0xDF)
4275 {
4276 /* SJIS -> JISX0201-Kana */
4277 c &= 0x7F;
4278 charset = charset_kana;
4279 }
4280 else if (c <= 0xEF)
4281 {
4282 /* SJIS -> JISX0208 */
4283 ONE_MORE_BYTE (c1);
4284 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4285 goto invalid_code;
4286 c = (c << 8) | c1;
4287 SJIS_TO_JIS (c);
4288 charset = charset_kanji;
4289 }
4290 else if (c <= 0xFC && charset_kanji2)
4291 {
4292 /* SJIS -> JISX0213-2 */
4293 ONE_MORE_BYTE (c1);
4294 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4295 goto invalid_code;
4296 c = (c << 8) | c1;
4297 SJIS_TO_JIS2 (c);
4298 charset = charset_kanji2;
4299 }
4300 else
4301 goto invalid_code;
4302 if (charset->id != charset_ascii
4303 && last_id != charset->id)
4304 {
4305 if (last_id != charset_ascii)
4306 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4307 last_id = charset->id;
4308 last_offset = char_offset;
4309 }
4310 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4311 *charbuf++ = c;
4312 char_offset++;
4313 continue;
4314
4315 invalid_code:
4316 src = src_base;
4317 consumed_chars = consumed_chars_base;
4318 ONE_MORE_BYTE (c);
4319 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4320 char_offset++;
4321 coding->errors++;
4322 }
4323
4324 no_more_source:
4325 if (last_id != charset_ascii)
4326 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4327 coding->consumed_char += consumed_chars_base;
4328 coding->consumed = src_base - coding->source;
4329 coding->charbuf_used = charbuf - coding->charbuf;
4330 }
4331
4332 static void
4333 decode_coding_big5 (coding)
4334 struct coding_system *coding;
4335 {
4336 const unsigned char *src = coding->source + coding->consumed;
4337 const unsigned char *src_end = coding->source + coding->src_bytes;
4338 const unsigned char *src_base;
4339 int *charbuf = coding->charbuf + coding->charbuf_used;
4340 int *charbuf_end
4341 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4342 int consumed_chars = 0, consumed_chars_base;
4343 int multibytep = coding->src_multibyte;
4344 struct charset *charset_roman, *charset_big5;
4345 Lisp_Object attrs, charset_list, val;
4346 int char_offset = coding->produced_char;
4347 int last_offset = char_offset;
4348 int last_id = charset_ascii;
4349 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4350 int byte_after_cr = -1;
4351
4352 CODING_GET_INFO (coding, attrs, charset_list);
4353 val = charset_list;
4354 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4355 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4356
4357 while (1)
4358 {
4359 int c, c1;
4360 struct charset *charset;
4361
4362 src_base = src;
4363 consumed_chars_base = consumed_chars;
4364
4365 if (charbuf >= charbuf_end)
4366 break;
4367
4368 if (byte_after_cr >= 0)
4369 c = byte_after_cr, byte_after_cr = -1;
4370 else
4371 ONE_MORE_BYTE (c);
4372
4373 if (c < 0)
4374 goto invalid_code;
4375 if (c < 0x80)
4376 {
4377 if (eol_crlf && c == '\r')
4378 ONE_MORE_BYTE (byte_after_cr);
4379 charset = charset_roman;
4380 }
4381 else
4382 {
4383 /* BIG5 -> Big5 */
4384 if (c < 0xA1 || c > 0xFE)
4385 goto invalid_code;
4386 ONE_MORE_BYTE (c1);
4387 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4388 goto invalid_code;
4389 c = c << 8 | c1;
4390 charset = charset_big5;
4391 }
4392 if (charset->id != charset_ascii
4393 && last_id != charset->id)
4394 {
4395 if (last_id != charset_ascii)
4396 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4397 last_id = charset->id;
4398 last_offset = char_offset;
4399 }
4400 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4401 *charbuf++ = c;
4402 char_offset++;
4403 continue;
4404
4405 invalid_code:
4406 src = src_base;
4407 consumed_chars = consumed_chars_base;
4408 ONE_MORE_BYTE (c);
4409 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4410 char_offset++;
4411 coding->errors++;
4412 }
4413
4414 no_more_source:
4415 if (last_id != charset_ascii)
4416 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4417 coding->consumed_char += consumed_chars_base;
4418 coding->consumed = src_base - coding->source;
4419 coding->charbuf_used = charbuf - coding->charbuf;
4420 }
4421
4422 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4423 This function can encode charsets `ascii', `katakana-jisx0201',
4424 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4425 are sure that all these charsets are registered as official charset
4426 (i.e. do not have extended leading-codes). Characters of other
4427 charsets are produced without any encoding. If SJIS_P is 1, encode
4428 SJIS text, else encode BIG5 text. */
4429
4430 static int
4431 encode_coding_sjis (coding)
4432 struct coding_system *coding;
4433 {
4434 int multibytep = coding->dst_multibyte;
4435 int *charbuf = coding->charbuf;
4436 int *charbuf_end = charbuf + coding->charbuf_used;
4437 unsigned char *dst = coding->destination + coding->produced;
4438 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4439 int safe_room = 4;
4440 int produced_chars = 0;
4441 Lisp_Object attrs, charset_list, val;
4442 int ascii_compatible;
4443 struct charset *charset_roman, *charset_kanji, *charset_kana;
4444 struct charset *charset_kanji2;
4445 int c;
4446
4447 CODING_GET_INFO (coding, attrs, charset_list);
4448 val = charset_list;
4449 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4450 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4451 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4452 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4453
4454 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4455
4456 while (charbuf < charbuf_end)
4457 {
4458 ASSURE_DESTINATION (safe_room);
4459 c = *charbuf++;
4460 /* Now encode the character C. */
4461 if (ASCII_CHAR_P (c) && ascii_compatible)
4462 EMIT_ONE_ASCII_BYTE (c);
4463 else if (CHAR_BYTE8_P (c))
4464 {
4465 c = CHAR_TO_BYTE8 (c);
4466 EMIT_ONE_BYTE (c);
4467 }
4468 else
4469 {
4470 unsigned code;
4471 struct charset *charset = char_charset (c, charset_list, &code);
4472
4473 if (!charset)
4474 {
4475 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4476 {
4477 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4478 charset = CHARSET_FROM_ID (charset_ascii);
4479 }
4480 else
4481 {
4482 c = coding->default_char;
4483 charset = char_charset (c, charset_list, &code);
4484 }
4485 }
4486 if (code == CHARSET_INVALID_CODE (charset))
4487 abort ();
4488 if (charset == charset_kanji)
4489 {
4490 int c1, c2;
4491 JIS_TO_SJIS (code);
4492 c1 = code >> 8, c2 = code & 0xFF;
4493 EMIT_TWO_BYTES (c1, c2);
4494 }
4495 else if (charset == charset_kana)
4496 EMIT_ONE_BYTE (code | 0x80);
4497 else if (charset_kanji2 && charset == charset_kanji2)
4498 {
4499 int c1, c2;
4500
4501 c1 = code >> 8;
4502 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4503 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4504 {
4505 JIS_TO_SJIS2 (code);
4506 c1 = code >> 8, c2 = code & 0xFF;
4507 EMIT_TWO_BYTES (c1, c2);
4508 }
4509 else
4510 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4511 }
4512 else
4513 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4514 }
4515 }
4516 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4517 coding->produced_char += produced_chars;
4518 coding->produced = dst - coding->destination;
4519 return 0;
4520 }
4521
4522 static int
4523 encode_coding_big5 (coding)
4524 struct coding_system *coding;
4525 {
4526 int multibytep = coding->dst_multibyte;
4527 int *charbuf = coding->charbuf;
4528 int *charbuf_end = charbuf + coding->charbuf_used;
4529 unsigned char *dst = coding->destination + coding->produced;
4530 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4531 int safe_room = 4;
4532 int produced_chars = 0;
4533 Lisp_Object attrs, charset_list, val;
4534 int ascii_compatible;
4535 struct charset *charset_roman, *charset_big5;
4536 int c;
4537
4538 CODING_GET_INFO (coding, attrs, charset_list);
4539 val = charset_list;
4540 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4541 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4542 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4543
4544 while (charbuf < charbuf_end)
4545 {
4546 ASSURE_DESTINATION (safe_room);
4547 c = *charbuf++;
4548 /* Now encode the character C. */
4549 if (ASCII_CHAR_P (c) && ascii_compatible)
4550 EMIT_ONE_ASCII_BYTE (c);
4551 else if (CHAR_BYTE8_P (c))
4552 {
4553 c = CHAR_TO_BYTE8 (c);
4554 EMIT_ONE_BYTE (c);
4555 }
4556 else
4557 {
4558 unsigned code;
4559 struct charset *charset = char_charset (c, charset_list, &code);
4560
4561 if (! charset)
4562 {
4563 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4564 {
4565 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4566 charset = CHARSET_FROM_ID (charset_ascii);
4567 }
4568 else
4569 {
4570 c = coding->default_char;
4571 charset = char_charset (c, charset_list, &code);
4572 }
4573 }
4574 if (code == CHARSET_INVALID_CODE (charset))
4575 abort ();
4576 if (charset == charset_big5)
4577 {
4578 int c1, c2;
4579
4580 c1 = code >> 8, c2 = code & 0xFF;
4581 EMIT_TWO_BYTES (c1, c2);
4582 }
4583 else
4584 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4585 }
4586 }
4587 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4588 coding->produced_char += produced_chars;
4589 coding->produced = dst - coding->destination;
4590 return 0;
4591 }
4592
4593 \f
4594 /*** 10. CCL handlers ***/
4595
4596 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4597 Check if a text is encoded in a coding system of which
4598 encoder/decoder are written in CCL program. If it is, return
4599 CATEGORY_MASK_CCL, else return 0. */
4600
4601 static int
4602 detect_coding_ccl (coding, detect_info)
4603 struct coding_system *coding;
4604 struct coding_detection_info *detect_info;
4605 {
4606 const unsigned char *src = coding->source, *src_base;
4607 const unsigned char *src_end = coding->source + coding->src_bytes;
4608 int multibytep = coding->src_multibyte;
4609 int consumed_chars = 0;
4610 int found = 0;
4611 unsigned char *valids;
4612 int head_ascii = coding->head_ascii;
4613 Lisp_Object attrs;
4614
4615 detect_info->checked |= CATEGORY_MASK_CCL;
4616
4617 coding = &coding_categories[coding_category_ccl];
4618 valids = CODING_CCL_VALIDS (coding);
4619 attrs = CODING_ID_ATTRS (coding->id);
4620 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4621 src += head_ascii;
4622
4623 while (1)
4624 {
4625 int c;
4626
4627 src_base = src;
4628 ONE_MORE_BYTE (c);
4629 if (c < 0 || ! valids[c])
4630 break;
4631 if ((valids[c] > 1))
4632 found = CATEGORY_MASK_CCL;
4633 }
4634 detect_info->rejected |= CATEGORY_MASK_CCL;
4635 return 0;
4636
4637 no_more_source:
4638 detect_info->found |= found;
4639 return 1;
4640 }
4641
4642 static void
4643 decode_coding_ccl (coding)
4644 struct coding_system *coding;
4645 {
4646 const unsigned char *src = coding->source + coding->consumed;
4647 const unsigned char *src_end = coding->source + coding->src_bytes;
4648 int *charbuf = coding->charbuf + coding->charbuf_used;
4649 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4650 int consumed_chars = 0;
4651 int multibytep = coding->src_multibyte;
4652 struct ccl_program ccl;
4653 int source_charbuf[1024];
4654 int source_byteidx[1024];
4655 Lisp_Object attrs, charset_list;
4656
4657 CODING_GET_INFO (coding, attrs, charset_list);
4658 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4659
4660 while (src < src_end)
4661 {
4662 const unsigned char *p = src;
4663 int *source, *source_end;
4664 int i = 0;
4665
4666 if (multibytep)
4667 while (i < 1024 && p < src_end)
4668 {
4669 source_byteidx[i] = p - src;
4670 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4671 }
4672 else
4673 while (i < 1024 && p < src_end)
4674 source_charbuf[i++] = *p++;
4675
4676 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4677 ccl.last_block = 1;
4678
4679 source = source_charbuf;
4680 source_end = source + i;
4681 while (source < source_end)
4682 {
4683 ccl_driver (&ccl, source, charbuf,
4684 source_end - source, charbuf_end - charbuf,
4685 charset_list);
4686 source += ccl.consumed;
4687 charbuf += ccl.produced;
4688 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4689 break;
4690 }
4691 if (source < source_end)
4692 src += source_byteidx[source - source_charbuf];
4693 else
4694 src = p;
4695 consumed_chars += source - source_charbuf;
4696
4697 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4698 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4699 break;
4700 }
4701
4702 switch (ccl.status)
4703 {
4704 case CCL_STAT_SUSPEND_BY_SRC:
4705 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4706 break;
4707 case CCL_STAT_SUSPEND_BY_DST:
4708 break;
4709 case CCL_STAT_QUIT:
4710 case CCL_STAT_INVALID_CMD:
4711 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4712 break;
4713 default:
4714 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4715 break;
4716 }
4717 coding->consumed_char += consumed_chars;
4718 coding->consumed = src - coding->source;
4719 coding->charbuf_used = charbuf - coding->charbuf;
4720 }
4721
4722 static int
4723 encode_coding_ccl (coding)
4724 struct coding_system *coding;
4725 {
4726 struct ccl_program ccl;
4727 int multibytep = coding->dst_multibyte;
4728 int *charbuf = coding->charbuf;
4729 int *charbuf_end = charbuf + coding->charbuf_used;
4730 unsigned char *dst = coding->destination + coding->produced;
4731 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4732 int destination_charbuf[1024];
4733 int i, produced_chars = 0;
4734 Lisp_Object attrs, charset_list;
4735
4736 CODING_GET_INFO (coding, attrs, charset_list);
4737 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4738
4739 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4740 ccl.dst_multibyte = coding->dst_multibyte;
4741
4742 while (charbuf < charbuf_end)
4743 {
4744 ccl_driver (&ccl, charbuf, destination_charbuf,
4745 charbuf_end - charbuf, 1024, charset_list);
4746 if (multibytep)
4747 {
4748 ASSURE_DESTINATION (ccl.produced * 2);
4749 for (i = 0; i < ccl.produced; i++)
4750 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4751 }
4752 else
4753 {
4754 ASSURE_DESTINATION (ccl.produced);
4755 for (i = 0; i < ccl.produced; i++)
4756 *dst++ = destination_charbuf[i] & 0xFF;
4757 produced_chars += ccl.produced;
4758 }
4759 charbuf += ccl.consumed;
4760 if (ccl.status == CCL_STAT_QUIT
4761 || ccl.status == CCL_STAT_INVALID_CMD)
4762 break;
4763 }
4764
4765 switch (ccl.status)
4766 {
4767 case CCL_STAT_SUSPEND_BY_SRC:
4768 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4769 break;
4770 case CCL_STAT_SUSPEND_BY_DST:
4771 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4772 break;
4773 case CCL_STAT_QUIT:
4774 case CCL_STAT_INVALID_CMD:
4775 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4776 break;
4777 default:
4778 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4779 break;
4780 }
4781
4782 coding->produced_char += produced_chars;
4783 coding->produced = dst - coding->destination;
4784 return 0;
4785 }
4786
4787
4788 \f
4789 /*** 10, 11. no-conversion handlers ***/
4790
4791 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4792
4793 static void
4794 decode_coding_raw_text (coding)
4795 struct coding_system *coding;
4796 {
4797 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4798
4799 coding->chars_at_source = 1;
4800 coding->consumed_char = coding->src_chars;
4801 coding->consumed = coding->src_bytes;
4802 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4803 {
4804 coding->consumed_char--;
4805 coding->consumed--;
4806 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4807 }
4808 else
4809 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4810 }
4811
4812 static int
4813 encode_coding_raw_text (coding)
4814 struct coding_system *coding;
4815 {
4816 int multibytep = coding->dst_multibyte;
4817 int *charbuf = coding->charbuf;
4818 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4819 unsigned char *dst = coding->destination + coding->produced;
4820 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4821 int produced_chars = 0;
4822 int c;
4823
4824 if (multibytep)
4825 {
4826 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4827
4828 if (coding->src_multibyte)
4829 while (charbuf < charbuf_end)
4830 {
4831 ASSURE_DESTINATION (safe_room);
4832 c = *charbuf++;
4833 if (ASCII_CHAR_P (c))
4834 EMIT_ONE_ASCII_BYTE (c);
4835 else if (CHAR_BYTE8_P (c))
4836 {
4837 c = CHAR_TO_BYTE8 (c);
4838 EMIT_ONE_BYTE (c);
4839 }
4840 else
4841 {
4842 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4843
4844 CHAR_STRING_ADVANCE (c, p1);
4845 while (p0 < p1)
4846 {
4847 EMIT_ONE_BYTE (*p0);
4848 p0++;
4849 }
4850 }
4851 }
4852 else
4853 while (charbuf < charbuf_end)
4854 {
4855 ASSURE_DESTINATION (safe_room);
4856 c = *charbuf++;
4857 EMIT_ONE_BYTE (c);
4858 }
4859 }
4860 else
4861 {
4862 if (coding->src_multibyte)
4863 {
4864 int safe_room = MAX_MULTIBYTE_LENGTH;
4865
4866 while (charbuf < charbuf_end)
4867 {
4868 ASSURE_DESTINATION (safe_room);
4869 c = *charbuf++;
4870 if (ASCII_CHAR_P (c))
4871 *dst++ = c;
4872 else if (CHAR_BYTE8_P (c))
4873 *dst++ = CHAR_TO_BYTE8 (c);
4874 else
4875 CHAR_STRING_ADVANCE (c, dst);
4876 produced_chars++;
4877 }
4878 }
4879 else
4880 {
4881 ASSURE_DESTINATION (charbuf_end - charbuf);
4882 while (charbuf < charbuf_end && dst < dst_end)
4883 *dst++ = *charbuf++;
4884 produced_chars = dst - (coding->destination + coding->dst_bytes);
4885 }
4886 }
4887 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4888 coding->produced_char += produced_chars;
4889 coding->produced = dst - coding->destination;
4890 return 0;
4891 }
4892
4893 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4894 Check if a text is encoded in a charset-based coding system. If it
4895 is, return 1, else return 0. */
4896
4897 static int
4898 detect_coding_charset (coding, detect_info)
4899 struct coding_system *coding;
4900 struct coding_detection_info *detect_info;
4901 {
4902 const unsigned char *src = coding->source, *src_base;
4903 const unsigned char *src_end = coding->source + coding->src_bytes;
4904 int multibytep = coding->src_multibyte;
4905 int consumed_chars = 0;
4906 Lisp_Object attrs, valids;
4907 int found = 0;
4908 int head_ascii = coding->head_ascii;
4909
4910 detect_info->checked |= CATEGORY_MASK_CHARSET;
4911
4912 coding = &coding_categories[coding_category_charset];
4913 attrs = CODING_ID_ATTRS (coding->id);
4914 valids = AREF (attrs, coding_attr_charset_valids);
4915
4916 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4917 src += head_ascii;
4918
4919 while (1)
4920 {
4921 int c;
4922 Lisp_Object val;
4923 struct charset *charset;
4924 int dim, idx;
4925
4926 src_base = src;
4927 ONE_MORE_BYTE (c);
4928 if (c < 0)
4929 continue;
4930 val = AREF (valids, c);
4931 if (NILP (val))
4932 break;
4933 if (c >= 0x80)
4934 found = CATEGORY_MASK_CHARSET;
4935 if (INTEGERP (val))
4936 {
4937 charset = CHARSET_FROM_ID (XFASTINT (val));
4938 dim = CHARSET_DIMENSION (charset);
4939 for (idx = 1; idx < dim; idx++)
4940 {
4941 if (src == src_end)
4942 goto too_short;
4943 ONE_MORE_BYTE (c);
4944 if (c < charset->code_space[(dim - 1 - idx) * 2]
4945 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
4946 break;
4947 }
4948 if (idx < dim)
4949 break;
4950 }
4951 else
4952 {
4953 idx = 1;
4954 for (; CONSP (val); val = XCDR (val))
4955 {
4956 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4957 dim = CHARSET_DIMENSION (charset);
4958 while (idx < dim)
4959 {
4960 if (src == src_end)
4961 goto too_short;
4962 ONE_MORE_BYTE (c);
4963 if (c < charset->code_space[(dim - 1 - idx) * 4]
4964 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
4965 break;
4966 idx++;
4967 }
4968 if (idx == dim)
4969 {
4970 val = Qnil;
4971 break;
4972 }
4973 }
4974 if (CONSP (val))
4975 break;
4976 }
4977 }
4978 too_short:
4979 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4980 return 0;
4981
4982 no_more_source:
4983 detect_info->found |= found;
4984 return 1;
4985 }
4986
4987 static void
4988 decode_coding_charset (coding)
4989 struct coding_system *coding;
4990 {
4991 const unsigned char *src = coding->source + coding->consumed;
4992 const unsigned char *src_end = coding->source + coding->src_bytes;
4993 const unsigned char *src_base;
4994 int *charbuf = coding->charbuf + coding->charbuf_used;
4995 int *charbuf_end
4996 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4997 int consumed_chars = 0, consumed_chars_base;
4998 int multibytep = coding->src_multibyte;
4999 Lisp_Object attrs, charset_list, valids;
5000 int char_offset = coding->produced_char;
5001 int last_offset = char_offset;
5002 int last_id = charset_ascii;
5003 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5004 int byte_after_cr = -1;
5005
5006 CODING_GET_INFO (coding, attrs, charset_list);
5007 valids = AREF (attrs, coding_attr_charset_valids);
5008
5009 while (1)
5010 {
5011 int c;
5012 Lisp_Object val;
5013 struct charset *charset;
5014 int dim;
5015 int len = 1;
5016 unsigned code;
5017
5018 src_base = src;
5019 consumed_chars_base = consumed_chars;
5020
5021 if (charbuf >= charbuf_end)
5022 break;
5023
5024 if (byte_after_cr >= 0)
5025 {
5026 c = byte_after_cr;
5027 byte_after_cr = -1;
5028 }
5029 else
5030 {
5031 ONE_MORE_BYTE (c);
5032 if (eol_crlf && c == '\r')
5033 ONE_MORE_BYTE (byte_after_cr);
5034 }
5035 if (c < 0)
5036 goto invalid_code;
5037 code = c;
5038
5039 val = AREF (valids, c);
5040 if (NILP (val))
5041 goto invalid_code;
5042 if (INTEGERP (val))
5043 {
5044 charset = CHARSET_FROM_ID (XFASTINT (val));
5045 dim = CHARSET_DIMENSION (charset);
5046 while (len < dim)
5047 {
5048 ONE_MORE_BYTE (c);
5049 code = (code << 8) | c;
5050 len++;
5051 }
5052 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5053 charset, code, c);
5054 }
5055 else
5056 {
5057 /* VAL is a list of charset IDs. It is assured that the
5058 list is sorted by charset dimensions (smaller one
5059 comes first). */
5060 while (CONSP (val))
5061 {
5062 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5063 dim = CHARSET_DIMENSION (charset);
5064 while (len < dim)
5065 {
5066 ONE_MORE_BYTE (c);
5067 code = (code << 8) | c;
5068 len++;
5069 }
5070 CODING_DECODE_CHAR (coding, src, src_base,
5071 src_end, charset, code, c);
5072 if (c >= 0)
5073 break;
5074 val = XCDR (val);
5075 }
5076 }
5077 if (c < 0)
5078 goto invalid_code;
5079 if (charset->id != charset_ascii
5080 && last_id != charset->id)
5081 {
5082 if (last_id != charset_ascii)
5083 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5084 last_id = charset->id;
5085 last_offset = char_offset;
5086 }
5087
5088 *charbuf++ = c;
5089 char_offset++;
5090 continue;
5091
5092 invalid_code:
5093 src = src_base;
5094 consumed_chars = consumed_chars_base;
5095 ONE_MORE_BYTE (c);
5096 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5097 char_offset++;
5098 coding->errors++;
5099 }
5100
5101 no_more_source:
5102 if (last_id != charset_ascii)
5103 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5104 coding->consumed_char += consumed_chars_base;
5105 coding->consumed = src_base - coding->source;
5106 coding->charbuf_used = charbuf - coding->charbuf;
5107 }
5108
5109 static int
5110 encode_coding_charset (coding)
5111 struct coding_system *coding;
5112 {
5113 int multibytep = coding->dst_multibyte;
5114 int *charbuf = coding->charbuf;
5115 int *charbuf_end = charbuf + coding->charbuf_used;
5116 unsigned char *dst = coding->destination + coding->produced;
5117 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5118 int safe_room = MAX_MULTIBYTE_LENGTH;
5119 int produced_chars = 0;
5120 Lisp_Object attrs, charset_list;
5121 int ascii_compatible;
5122 int c;
5123
5124 CODING_GET_INFO (coding, attrs, charset_list);
5125 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5126
5127 while (charbuf < charbuf_end)
5128 {
5129 struct charset *charset;
5130 unsigned code;
5131
5132 ASSURE_DESTINATION (safe_room);
5133 c = *charbuf++;
5134 if (ascii_compatible && ASCII_CHAR_P (c))
5135 EMIT_ONE_ASCII_BYTE (c);
5136 else if (CHAR_BYTE8_P (c))
5137 {
5138 c = CHAR_TO_BYTE8 (c);
5139 EMIT_ONE_BYTE (c);
5140 }
5141 else
5142 {
5143 charset = char_charset (c, charset_list, &code);
5144 if (charset)
5145 {
5146 if (CHARSET_DIMENSION (charset) == 1)
5147 EMIT_ONE_BYTE (code);
5148 else if (CHARSET_DIMENSION (charset) == 2)
5149 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5150 else if (CHARSET_DIMENSION (charset) == 3)
5151 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5152 else
5153 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5154 (code >> 8) & 0xFF, code & 0xFF);
5155 }
5156 else
5157 {
5158 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5159 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5160 else
5161 c = coding->default_char;
5162 EMIT_ONE_BYTE (c);
5163 }
5164 }
5165 }
5166
5167 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5168 coding->produced_char += produced_chars;
5169 coding->produced = dst - coding->destination;
5170 return 0;
5171 }
5172
5173 \f
5174 /*** 7. C library functions ***/
5175
5176 /* Setup coding context CODING from information about CODING_SYSTEM.
5177 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5178 CODING_SYSTEM is invalid, signal an error. */
5179
5180 void
5181 setup_coding_system (coding_system, coding)
5182 Lisp_Object coding_system;
5183 struct coding_system *coding;
5184 {
5185 Lisp_Object attrs;
5186 Lisp_Object eol_type;
5187 Lisp_Object coding_type;
5188 Lisp_Object val;
5189
5190 if (NILP (coding_system))
5191 coding_system = Qundecided;
5192
5193 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5194
5195 attrs = CODING_ID_ATTRS (coding->id);
5196 eol_type = CODING_ID_EOL_TYPE (coding->id);
5197
5198 coding->mode = 0;
5199 coding->head_ascii = -1;
5200 if (VECTORP (eol_type))
5201 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5202 | CODING_REQUIRE_DETECTION_MASK);
5203 else if (! EQ (eol_type, Qunix))
5204 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5205 | CODING_REQUIRE_ENCODING_MASK);
5206 else
5207 coding->common_flags = 0;
5208 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5209 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5210 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5211 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5212 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5213 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5214
5215 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5216 coding->max_charset_id = SCHARS (val) - 1;
5217 coding->safe_charsets = (char *) SDATA (val);
5218 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5219
5220 coding_type = CODING_ATTR_TYPE (attrs);
5221 if (EQ (coding_type, Qundecided))
5222 {
5223 coding->detector = NULL;
5224 coding->decoder = decode_coding_raw_text;
5225 coding->encoder = encode_coding_raw_text;
5226 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5227 }
5228 else if (EQ (coding_type, Qiso_2022))
5229 {
5230 int i;
5231 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5232
5233 /* Invoke graphic register 0 to plane 0. */
5234 CODING_ISO_INVOCATION (coding, 0) = 0;
5235 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5236 CODING_ISO_INVOCATION (coding, 1)
5237 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5238 /* Setup the initial status of designation. */
5239 for (i = 0; i < 4; i++)
5240 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5241 /* Not single shifting initially. */
5242 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5243 /* Beginning of buffer should also be regarded as bol. */
5244 CODING_ISO_BOL (coding) = 1;
5245 coding->detector = detect_coding_iso_2022;
5246 coding->decoder = decode_coding_iso_2022;
5247 coding->encoder = encode_coding_iso_2022;
5248 if (flags & CODING_ISO_FLAG_SAFE)
5249 coding->mode |= CODING_MODE_SAFE_ENCODING;
5250 coding->common_flags
5251 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5252 | CODING_REQUIRE_FLUSHING_MASK);
5253 if (flags & CODING_ISO_FLAG_COMPOSITION)
5254 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5255 if (flags & CODING_ISO_FLAG_DESIGNATION)
5256 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5257 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5258 {
5259 setup_iso_safe_charsets (attrs);
5260 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5261 coding->max_charset_id = SCHARS (val) - 1;
5262 coding->safe_charsets = (char *) SDATA (val);
5263 }
5264 CODING_ISO_FLAGS (coding) = flags;
5265 }
5266 else if (EQ (coding_type, Qcharset))
5267 {
5268 coding->detector = detect_coding_charset;
5269 coding->decoder = decode_coding_charset;
5270 coding->encoder = encode_coding_charset;
5271 coding->common_flags
5272 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5273 }
5274 else if (EQ (coding_type, Qutf_8))
5275 {
5276 coding->detector = detect_coding_utf_8;
5277 coding->decoder = decode_coding_utf_8;
5278 coding->encoder = encode_coding_utf_8;
5279 coding->common_flags
5280 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5281 }
5282 else if (EQ (coding_type, Qutf_16))
5283 {
5284 val = AREF (attrs, coding_attr_utf_16_bom);
5285 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5286 : EQ (val, Qt) ? utf_16_with_bom
5287 : utf_16_without_bom);
5288 val = AREF (attrs, coding_attr_utf_16_endian);
5289 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5290 : utf_16_little_endian);
5291 CODING_UTF_16_SURROGATE (coding) = 0;
5292 coding->detector = detect_coding_utf_16;
5293 coding->decoder = decode_coding_utf_16;
5294 coding->encoder = encode_coding_utf_16;
5295 coding->common_flags
5296 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5297 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5298 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5299 }
5300 else if (EQ (coding_type, Qccl))
5301 {
5302 coding->detector = detect_coding_ccl;
5303 coding->decoder = decode_coding_ccl;
5304 coding->encoder = encode_coding_ccl;
5305 coding->common_flags
5306 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5307 | CODING_REQUIRE_FLUSHING_MASK);
5308 }
5309 else if (EQ (coding_type, Qemacs_mule))
5310 {
5311 coding->detector = detect_coding_emacs_mule;
5312 coding->decoder = decode_coding_emacs_mule;
5313 coding->encoder = encode_coding_emacs_mule;
5314 coding->common_flags
5315 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5316 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5317 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5318 {
5319 Lisp_Object tail, safe_charsets;
5320 int max_charset_id = 0;
5321
5322 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5323 tail = XCDR (tail))
5324 if (max_charset_id < XFASTINT (XCAR (tail)))
5325 max_charset_id = XFASTINT (XCAR (tail));
5326 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5327 make_number (255));
5328 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5329 tail = XCDR (tail))
5330 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5331 coding->max_charset_id = max_charset_id;
5332 coding->safe_charsets = (char *) SDATA (safe_charsets);
5333 }
5334 }
5335 else if (EQ (coding_type, Qshift_jis))
5336 {
5337 coding->detector = detect_coding_sjis;
5338 coding->decoder = decode_coding_sjis;
5339 coding->encoder = encode_coding_sjis;
5340 coding->common_flags
5341 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5342 }
5343 else if (EQ (coding_type, Qbig5))
5344 {
5345 coding->detector = detect_coding_big5;
5346 coding->decoder = decode_coding_big5;
5347 coding->encoder = encode_coding_big5;
5348 coding->common_flags
5349 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5350 }
5351 else /* EQ (coding_type, Qraw_text) */
5352 {
5353 coding->detector = NULL;
5354 coding->decoder = decode_coding_raw_text;
5355 coding->encoder = encode_coding_raw_text;
5356 if (! EQ (eol_type, Qunix))
5357 {
5358 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5359 if (! VECTORP (eol_type))
5360 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5361 }
5362
5363 }
5364
5365 return;
5366 }
5367
5368 /* Return a list of charsets supported by CODING. */
5369
5370 Lisp_Object
5371 coding_charset_list (coding)
5372 struct coding_system *coding;
5373 {
5374 Lisp_Object attrs, charset_list;
5375
5376 CODING_GET_INFO (coding, attrs, charset_list);
5377 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5378 {
5379 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5380
5381 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5382 charset_list = Viso_2022_charset_list;
5383 }
5384 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5385 {
5386 charset_list = Vemacs_mule_charset_list;
5387 }
5388 return charset_list;
5389 }
5390
5391
5392 /* Return raw-text or one of its subsidiaries that has the same
5393 eol_type as CODING-SYSTEM. */
5394
5395 Lisp_Object
5396 raw_text_coding_system (coding_system)
5397 Lisp_Object coding_system;
5398 {
5399 Lisp_Object spec, attrs;
5400 Lisp_Object eol_type, raw_text_eol_type;
5401
5402 if (NILP (coding_system))
5403 return Qraw_text;
5404 spec = CODING_SYSTEM_SPEC (coding_system);
5405 attrs = AREF (spec, 0);
5406
5407 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5408 return coding_system;
5409
5410 eol_type = AREF (spec, 2);
5411 if (VECTORP (eol_type))
5412 return Qraw_text;
5413 spec = CODING_SYSTEM_SPEC (Qraw_text);
5414 raw_text_eol_type = AREF (spec, 2);
5415 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5416 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5417 : AREF (raw_text_eol_type, 2));
5418 }
5419
5420
5421 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5422 does, return one of the subsidiary that has the same eol-spec as
5423 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5424 inherit end-of-line format from the system's setting
5425 (system_eol_type). */
5426
5427 Lisp_Object
5428 coding_inherit_eol_type (coding_system, parent)
5429 Lisp_Object coding_system, parent;
5430 {
5431 Lisp_Object spec, eol_type;
5432
5433 if (NILP (coding_system))
5434 coding_system = Qraw_text;
5435 spec = CODING_SYSTEM_SPEC (coding_system);
5436 eol_type = AREF (spec, 2);
5437 if (VECTORP (eol_type))
5438 {
5439 Lisp_Object parent_eol_type;
5440
5441 if (! NILP (parent))
5442 {
5443 Lisp_Object parent_spec;
5444
5445 parent_spec = CODING_SYSTEM_SPEC (parent);
5446 parent_eol_type = AREF (parent_spec, 2);
5447 }
5448 else
5449 parent_eol_type = system_eol_type;
5450 if (EQ (parent_eol_type, Qunix))
5451 coding_system = AREF (eol_type, 0);
5452 else if (EQ (parent_eol_type, Qdos))
5453 coding_system = AREF (eol_type, 1);
5454 else if (EQ (parent_eol_type, Qmac))
5455 coding_system = AREF (eol_type, 2);
5456 }
5457 return coding_system;
5458 }
5459
5460 /* Emacs has a mechanism to automatically detect a coding system if it
5461 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5462 it's impossible to distinguish some coding systems accurately
5463 because they use the same range of codes. So, at first, coding
5464 systems are categorized into 7, those are:
5465
5466 o coding-category-emacs-mule
5467
5468 The category for a coding system which has the same code range
5469 as Emacs' internal format. Assigned the coding-system (Lisp
5470 symbol) `emacs-mule' by default.
5471
5472 o coding-category-sjis
5473
5474 The category for a coding system which has the same code range
5475 as SJIS. Assigned the coding-system (Lisp
5476 symbol) `japanese-shift-jis' by default.
5477
5478 o coding-category-iso-7
5479
5480 The category for a coding system which has the same code range
5481 as ISO2022 of 7-bit environment. This doesn't use any locking
5482 shift and single shift functions. This can encode/decode all
5483 charsets. Assigned the coding-system (Lisp symbol)
5484 `iso-2022-7bit' by default.
5485
5486 o coding-category-iso-7-tight
5487
5488 Same as coding-category-iso-7 except that this can
5489 encode/decode only the specified charsets.
5490
5491 o coding-category-iso-8-1
5492
5493 The category for a coding system which has the same code range
5494 as ISO2022 of 8-bit environment and graphic plane 1 used only
5495 for DIMENSION1 charset. This doesn't use any locking shift
5496 and single shift functions. Assigned the coding-system (Lisp
5497 symbol) `iso-latin-1' by default.
5498
5499 o coding-category-iso-8-2
5500
5501 The category for a coding system which has the same code range
5502 as ISO2022 of 8-bit environment and graphic plane 1 used only
5503 for DIMENSION2 charset. This doesn't use any locking shift
5504 and single shift functions. Assigned the coding-system (Lisp
5505 symbol) `japanese-iso-8bit' by default.
5506
5507 o coding-category-iso-7-else
5508
5509 The category for a coding system which has the same code range
5510 as ISO2022 of 7-bit environemnt but uses locking shift or
5511 single shift functions. Assigned the coding-system (Lisp
5512 symbol) `iso-2022-7bit-lock' by default.
5513
5514 o coding-category-iso-8-else
5515
5516 The category for a coding system which has the same code range
5517 as ISO2022 of 8-bit environemnt but uses locking shift or
5518 single shift functions. Assigned the coding-system (Lisp
5519 symbol) `iso-2022-8bit-ss2' by default.
5520
5521 o coding-category-big5
5522
5523 The category for a coding system which has the same code range
5524 as BIG5. Assigned the coding-system (Lisp symbol)
5525 `cn-big5' by default.
5526
5527 o coding-category-utf-8
5528
5529 The category for a coding system which has the same code range
5530 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5531 symbol) `utf-8' by default.
5532
5533 o coding-category-utf-16-be
5534
5535 The category for a coding system in which a text has an
5536 Unicode signature (cf. Unicode Standard) in the order of BIG
5537 endian at the head. Assigned the coding-system (Lisp symbol)
5538 `utf-16-be' by default.
5539
5540 o coding-category-utf-16-le
5541
5542 The category for a coding system in which a text has an
5543 Unicode signature (cf. Unicode Standard) in the order of
5544 LITTLE endian at the head. Assigned the coding-system (Lisp
5545 symbol) `utf-16-le' by default.
5546
5547 o coding-category-ccl
5548
5549 The category for a coding system of which encoder/decoder is
5550 written in CCL programs. The default value is nil, i.e., no
5551 coding system is assigned.
5552
5553 o coding-category-binary
5554
5555 The category for a coding system not categorized in any of the
5556 above. Assigned the coding-system (Lisp symbol)
5557 `no-conversion' by default.
5558
5559 Each of them is a Lisp symbol and the value is an actual
5560 `coding-system's (this is also a Lisp symbol) assigned by a user.
5561 What Emacs does actually is to detect a category of coding system.
5562 Then, it uses a `coding-system' assigned to it. If Emacs can't
5563 decide only one possible category, it selects a category of the
5564 highest priority. Priorities of categories are also specified by a
5565 user in a Lisp variable `coding-category-list'.
5566
5567 */
5568
5569 #define EOL_SEEN_NONE 0
5570 #define EOL_SEEN_LF 1
5571 #define EOL_SEEN_CR 2
5572 #define EOL_SEEN_CRLF 4
5573
5574 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5575 SOURCE is encoded. If CATEGORY is one of
5576 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5577 two-byte, else they are encoded by one-byte.
5578
5579 Return one of EOL_SEEN_XXX. */
5580
5581 #define MAX_EOL_CHECK_COUNT 3
5582
5583 static int
5584 detect_eol (source, src_bytes, category)
5585 const unsigned char *source;
5586 EMACS_INT src_bytes;
5587 enum coding_category category;
5588 {
5589 const unsigned char *src = source, *src_end = src + src_bytes;
5590 unsigned char c;
5591 int total = 0;
5592 int eol_seen = EOL_SEEN_NONE;
5593
5594 if ((1 << category) & CATEGORY_MASK_UTF_16)
5595 {
5596 int msb, lsb;
5597
5598 msb = category == (coding_category_utf_16_le
5599 | coding_category_utf_16_le_nosig);
5600 lsb = 1 - msb;
5601
5602 while (src + 1 < src_end)
5603 {
5604 c = src[lsb];
5605 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5606 {
5607 int this_eol;
5608
5609 if (c == '\n')
5610 this_eol = EOL_SEEN_LF;
5611 else if (src + 3 >= src_end
5612 || src[msb + 2] != 0
5613 || src[lsb + 2] != '\n')
5614 this_eol = EOL_SEEN_CR;
5615 else
5616 this_eol = EOL_SEEN_CRLF;
5617
5618 if (eol_seen == EOL_SEEN_NONE)
5619 /* This is the first end-of-line. */
5620 eol_seen = this_eol;
5621 else if (eol_seen != this_eol)
5622 {
5623 /* The found type is different from what found before. */
5624 eol_seen = EOL_SEEN_LF;
5625 break;
5626 }
5627 if (++total == MAX_EOL_CHECK_COUNT)
5628 break;
5629 }
5630 src += 2;
5631 }
5632 }
5633 else
5634 {
5635 while (src < src_end)
5636 {
5637 c = *src++;
5638 if (c == '\n' || c == '\r')
5639 {
5640 int this_eol;
5641
5642 if (c == '\n')
5643 this_eol = EOL_SEEN_LF;
5644 else if (src >= src_end || *src != '\n')
5645 this_eol = EOL_SEEN_CR;
5646 else
5647 this_eol = EOL_SEEN_CRLF, src++;
5648
5649 if (eol_seen == EOL_SEEN_NONE)
5650 /* This is the first end-of-line. */
5651 eol_seen = this_eol;
5652 else if (eol_seen != this_eol)
5653 {
5654 /* The found type is different from what found before. */
5655 eol_seen = EOL_SEEN_LF;
5656 break;
5657 }
5658 if (++total == MAX_EOL_CHECK_COUNT)
5659 break;
5660 }
5661 }
5662 }
5663 return eol_seen;
5664 }
5665
5666
5667 static Lisp_Object
5668 adjust_coding_eol_type (coding, eol_seen)
5669 struct coding_system *coding;
5670 int eol_seen;
5671 {
5672 Lisp_Object eol_type;
5673
5674 eol_type = CODING_ID_EOL_TYPE (coding->id);
5675 if (eol_seen & EOL_SEEN_LF)
5676 {
5677 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5678 eol_type = Qunix;
5679 }
5680 else if (eol_seen & EOL_SEEN_CRLF)
5681 {
5682 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5683 eol_type = Qdos;
5684 }
5685 else if (eol_seen & EOL_SEEN_CR)
5686 {
5687 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5688 eol_type = Qmac;
5689 }
5690 return eol_type;
5691 }
5692
5693 /* Detect how a text specified in CODING is encoded. If a coding
5694 system is detected, update fields of CODING by the detected coding
5695 system. */
5696
5697 void
5698 detect_coding (coding)
5699 struct coding_system *coding;
5700 {
5701 const unsigned char *src, *src_end;
5702
5703 coding->consumed = coding->consumed_char = 0;
5704 coding->produced = coding->produced_char = 0;
5705 coding_set_source (coding);
5706
5707 src_end = coding->source + coding->src_bytes;
5708
5709 /* If we have not yet decided the text encoding type, detect it
5710 now. */
5711 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5712 {
5713 int c, i;
5714 struct coding_detection_info detect_info;
5715 int null_byte_found = 0, eight_bit_found = 0;
5716
5717 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5718 coding->head_ascii = -1;
5719 for (src = coding->source; src < src_end; src++)
5720 {
5721 c = *src;
5722 if (c & 0x80)
5723 {
5724 eight_bit_found = 1;
5725 if (coding->head_ascii < 0)
5726 coding->head_ascii = src - coding->source;
5727 if (null_byte_found)
5728 break;
5729 }
5730 else if (c < 0x20)
5731 {
5732 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5733 && ! inhibit_iso_escape_detection
5734 && ! detect_info.checked)
5735 {
5736 if (coding->head_ascii < 0)
5737 coding->head_ascii = src - coding->source;
5738 if (detect_coding_iso_2022 (coding, &detect_info))
5739 {
5740 /* We have scanned the whole data. */
5741 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5742 /* We didn't find an 8-bit code. We may have
5743 found a null-byte, but it's very rare that
5744 a binary file confirm to ISO-2022. */
5745 src = src_end;
5746 break;
5747 }
5748 }
5749 else if (! c)
5750 {
5751 null_byte_found = 1;
5752 if (eight_bit_found)
5753 break;
5754 }
5755 }
5756 }
5757 if (coding->head_ascii < 0)
5758 coding->head_ascii = src - coding->source;
5759
5760 if (null_byte_found || eight_bit_found
5761 || coding->head_ascii < coding->src_bytes
5762 || detect_info.found)
5763 {
5764 enum coding_category category;
5765 struct coding_system *this;
5766
5767 if (coding->head_ascii == coding->src_bytes)
5768 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5769 for (i = 0; i < coding_category_raw_text; i++)
5770 {
5771 category = coding_priorities[i];
5772 this = coding_categories + category;
5773 if (detect_info.found & (1 << category))
5774 break;
5775 }
5776 else
5777 {
5778 if (null_byte_found)
5779 {
5780 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5781 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
5782 }
5783 for (i = 0; i < coding_category_raw_text; i++)
5784 {
5785 category = coding_priorities[i];
5786 this = coding_categories + category;
5787 if (this->id < 0)
5788 {
5789 /* No coding system of this category is defined. */
5790 detect_info.rejected |= (1 << category);
5791 }
5792 else if (category >= coding_category_raw_text)
5793 continue;
5794 else if (detect_info.checked & (1 << category))
5795 {
5796 if (detect_info.found & (1 << category))
5797 break;
5798 }
5799 else if ((*(this->detector)) (coding, &detect_info)
5800 && detect_info.found & (1 << category))
5801 {
5802 if (category == coding_category_utf_16_auto)
5803 {
5804 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5805 category = coding_category_utf_16_le;
5806 else
5807 category = coding_category_utf_16_be;
5808 }
5809 break;
5810 }
5811 }
5812
5813 if (i < coding_category_raw_text)
5814 setup_coding_system (CODING_ID_NAME (this->id), coding);
5815 else if (null_byte_found)
5816 setup_coding_system (Qno_conversion, coding);
5817 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5818 == CATEGORY_MASK_ANY)
5819 setup_coding_system (Qraw_text, coding);
5820 else if (detect_info.rejected)
5821 for (i = 0; i < coding_category_raw_text; i++)
5822 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5823 {
5824 this = coding_categories + coding_priorities[i];
5825 setup_coding_system (CODING_ID_NAME (this->id), coding);
5826 break;
5827 }
5828 }
5829 }
5830 }
5831 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5832 == coding_category_utf_16_auto)
5833 {
5834 Lisp_Object coding_systems;
5835 struct coding_detection_info detect_info;
5836
5837 coding_systems
5838 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5839 detect_info.found = detect_info.rejected = 0;
5840 if (CONSP (coding_systems)
5841 && detect_coding_utf_16 (coding, &detect_info))
5842 {
5843 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5844 setup_coding_system (XCAR (coding_systems), coding);
5845 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5846 setup_coding_system (XCDR (coding_systems), coding);
5847 }
5848 }
5849 }
5850
5851
5852 static void
5853 decode_eol (coding)
5854 struct coding_system *coding;
5855 {
5856 Lisp_Object eol_type;
5857 unsigned char *p, *pbeg, *pend;
5858
5859 eol_type = CODING_ID_EOL_TYPE (coding->id);
5860 if (EQ (eol_type, Qunix))
5861 return;
5862
5863 if (NILP (coding->dst_object))
5864 pbeg = coding->destination;
5865 else
5866 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5867 pend = pbeg + coding->produced;
5868
5869 if (VECTORP (eol_type))
5870 {
5871 int eol_seen = EOL_SEEN_NONE;
5872
5873 for (p = pbeg; p < pend; p++)
5874 {
5875 if (*p == '\n')
5876 eol_seen |= EOL_SEEN_LF;
5877 else if (*p == '\r')
5878 {
5879 if (p + 1 < pend && *(p + 1) == '\n')
5880 {
5881 eol_seen |= EOL_SEEN_CRLF;
5882 p++;
5883 }
5884 else
5885 eol_seen |= EOL_SEEN_CR;
5886 }
5887 }
5888 if (eol_seen != EOL_SEEN_NONE
5889 && eol_seen != EOL_SEEN_LF
5890 && eol_seen != EOL_SEEN_CRLF
5891 && eol_seen != EOL_SEEN_CR)
5892 eol_seen = EOL_SEEN_LF;
5893 if (eol_seen != EOL_SEEN_NONE)
5894 eol_type = adjust_coding_eol_type (coding, eol_seen);
5895 }
5896
5897 if (EQ (eol_type, Qmac))
5898 {
5899 for (p = pbeg; p < pend; p++)
5900 if (*p == '\r')
5901 *p = '\n';
5902 }
5903 else if (EQ (eol_type, Qdos))
5904 {
5905 int n = 0;
5906
5907 if (NILP (coding->dst_object))
5908 {
5909 /* Start deleting '\r' from the tail to minimize the memory
5910 movement. */
5911 for (p = pend - 2; p >= pbeg; p--)
5912 if (*p == '\r')
5913 {
5914 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5915 n++;
5916 }
5917 }
5918 else
5919 {
5920 int pos_byte = coding->dst_pos_byte;
5921 int pos = coding->dst_pos;
5922 int pos_end = pos + coding->produced_char - 1;
5923
5924 while (pos < pos_end)
5925 {
5926 p = BYTE_POS_ADDR (pos_byte);
5927 if (*p == '\r' && p[1] == '\n')
5928 {
5929 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5930 n++;
5931 pos_end--;
5932 }
5933 pos++;
5934 if (coding->dst_multibyte)
5935 pos_byte += BYTES_BY_CHAR_HEAD (*p);
5936 else
5937 pos_byte++;
5938 }
5939 }
5940 coding->produced -= n;
5941 coding->produced_char -= n;
5942 }
5943 }
5944
5945
5946 /* Return a translation table (or list of them) from coding system
5947 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5948 decoding (ENCODEP is zero). */
5949
5950 static Lisp_Object
5951 get_translation_table (attrs, encodep, max_lookup)
5952 Lisp_Object attrs;
5953 int encodep, *max_lookup;
5954 {
5955 Lisp_Object standard, translation_table;
5956 Lisp_Object val;
5957
5958 if (encodep)
5959 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5960 standard = Vstandard_translation_table_for_encode;
5961 else
5962 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5963 standard = Vstandard_translation_table_for_decode;
5964 if (NILP (translation_table))
5965 translation_table = standard;
5966 else
5967 {
5968 if (SYMBOLP (translation_table))
5969 translation_table = Fget (translation_table, Qtranslation_table);
5970 else if (CONSP (translation_table))
5971 {
5972 translation_table = Fcopy_sequence (translation_table);
5973 for (val = translation_table; CONSP (val); val = XCDR (val))
5974 if (SYMBOLP (XCAR (val)))
5975 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5976 }
5977 if (CHAR_TABLE_P (standard))
5978 {
5979 if (CONSP (translation_table))
5980 translation_table = nconc2 (translation_table,
5981 Fcons (standard, Qnil));
5982 else
5983 translation_table = Fcons (translation_table,
5984 Fcons (standard, Qnil));
5985 }
5986 }
5987
5988 if (max_lookup)
5989 {
5990 *max_lookup = 1;
5991 if (CHAR_TABLE_P (translation_table)
5992 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5993 {
5994 val = XCHAR_TABLE (translation_table)->extras[1];
5995 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5996 *max_lookup = XFASTINT (val);
5997 }
5998 else if (CONSP (translation_table))
5999 {
6000 Lisp_Object tail, val;
6001
6002 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6003 if (CHAR_TABLE_P (XCAR (tail))
6004 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6005 {
6006 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6007 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6008 *max_lookup = XFASTINT (val);
6009 }
6010 }
6011 }
6012 return translation_table;
6013 }
6014
6015 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6016 do { \
6017 trans = Qnil; \
6018 if (CHAR_TABLE_P (table)) \
6019 { \
6020 trans = CHAR_TABLE_REF (table, c); \
6021 if (CHARACTERP (trans)) \
6022 c = XFASTINT (trans), trans = Qnil; \
6023 } \
6024 else if (CONSP (table)) \
6025 { \
6026 Lisp_Object tail; \
6027 \
6028 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6029 if (CHAR_TABLE_P (XCAR (tail))) \
6030 { \
6031 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6032 if (CHARACTERP (trans)) \
6033 c = XFASTINT (trans), trans = Qnil; \
6034 else if (! NILP (trans)) \
6035 break; \
6036 } \
6037 } \
6038 } while (0)
6039
6040
6041 static Lisp_Object
6042 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
6043 Lisp_Object val;
6044 int *buf, *buf_end;
6045 int last_block;
6046 int *from_nchars, *to_nchars;
6047 {
6048 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
6049 [TO-CHAR ...]. */
6050 if (CONSP (val))
6051 {
6052 Lisp_Object from, tail;
6053 int i, len;
6054
6055 for (tail = val; CONSP (tail); tail = XCDR (tail))
6056 {
6057 val = XCAR (tail);
6058 from = XCAR (val);
6059 len = ASIZE (from);
6060 for (i = 0; i < len; i++)
6061 {
6062 if (buf + i == buf_end)
6063 {
6064 if (! last_block)
6065 return Qt;
6066 break;
6067 }
6068 if (XINT (AREF (from, i)) != buf[i])
6069 break;
6070 }
6071 if (i == len)
6072 {
6073 val = XCDR (val);
6074 *from_nchars = len;
6075 break;
6076 }
6077 }
6078 if (! CONSP (tail))
6079 return Qnil;
6080 }
6081 if (VECTORP (val))
6082 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6083 else
6084 *buf = XINT (val);
6085 return val;
6086 }
6087
6088
6089 static int
6090 produce_chars (coding, translation_table, last_block)
6091 struct coding_system *coding;
6092 Lisp_Object translation_table;
6093 int last_block;
6094 {
6095 unsigned char *dst = coding->destination + coding->produced;
6096 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6097 EMACS_INT produced;
6098 EMACS_INT produced_chars = 0;
6099 int carryover = 0;
6100
6101 if (! coding->chars_at_source)
6102 {
6103 /* Source characters are in coding->charbuf. */
6104 int *buf = coding->charbuf;
6105 int *buf_end = buf + coding->charbuf_used;
6106
6107 if (EQ (coding->src_object, coding->dst_object))
6108 {
6109 coding_set_source (coding);
6110 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6111 }
6112
6113 while (buf < buf_end)
6114 {
6115 int c = *buf, i;
6116
6117 if (c >= 0)
6118 {
6119 int from_nchars = 1, to_nchars = 1;
6120 Lisp_Object trans = Qnil;
6121
6122 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6123 if (! NILP (trans))
6124 {
6125 trans = get_translation (trans, buf, buf_end, last_block,
6126 &from_nchars, &to_nchars);
6127 if (EQ (trans, Qt))
6128 break;
6129 c = *buf;
6130 }
6131
6132 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6133 {
6134 dst = alloc_destination (coding,
6135 buf_end - buf
6136 + MAX_MULTIBYTE_LENGTH * to_nchars,
6137 dst);
6138 if (EQ (coding->src_object, coding->dst_object))
6139 {
6140 coding_set_source (coding);
6141 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6142 }
6143 else
6144 dst_end = coding->destination + coding->dst_bytes;
6145 }
6146
6147 for (i = 0; i < to_nchars; i++)
6148 {
6149 if (i > 0)
6150 c = XINT (AREF (trans, i));
6151 if (coding->dst_multibyte
6152 || ! CHAR_BYTE8_P (c))
6153 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6154 else
6155 *dst++ = CHAR_TO_BYTE8 (c);
6156 }
6157 produced_chars += to_nchars;
6158 *buf++ = to_nchars;
6159 while (--from_nchars > 0)
6160 *buf++ = 0;
6161 }
6162 else
6163 /* This is an annotation datum. (-C) is the length. */
6164 buf += -c;
6165 }
6166 carryover = buf_end - buf;
6167 }
6168 else
6169 {
6170 /* Source characters are at coding->source. */
6171 const unsigned char *src = coding->source;
6172 const unsigned char *src_end = src + coding->consumed;
6173
6174 if (EQ (coding->dst_object, coding->src_object))
6175 dst_end = (unsigned char *) src;
6176 if (coding->src_multibyte != coding->dst_multibyte)
6177 {
6178 if (coding->src_multibyte)
6179 {
6180 int multibytep = 1;
6181 EMACS_INT consumed_chars;
6182
6183 while (1)
6184 {
6185 const unsigned char *src_base = src;
6186 int c;
6187
6188 ONE_MORE_BYTE (c);
6189 if (dst == dst_end)
6190 {
6191 if (EQ (coding->src_object, coding->dst_object))
6192 dst_end = (unsigned char *) src;
6193 if (dst == dst_end)
6194 {
6195 EMACS_INT offset = src - coding->source;
6196
6197 dst = alloc_destination (coding, src_end - src + 1,
6198 dst);
6199 dst_end = coding->destination + coding->dst_bytes;
6200 coding_set_source (coding);
6201 src = coding->source + offset;
6202 src_end = coding->source + coding->src_bytes;
6203 if (EQ (coding->src_object, coding->dst_object))
6204 dst_end = (unsigned char *) src;
6205 }
6206 }
6207 *dst++ = c;
6208 produced_chars++;
6209 }
6210 no_more_source:
6211 ;
6212 }
6213 else
6214 while (src < src_end)
6215 {
6216 int multibytep = 1;
6217 int c = *src++;
6218
6219 if (dst >= dst_end - 1)
6220 {
6221 if (EQ (coding->src_object, coding->dst_object))
6222 dst_end = (unsigned char *) src;
6223 if (dst >= dst_end - 1)
6224 {
6225 EMACS_INT offset = src - coding->source;
6226 EMACS_INT more_bytes;
6227
6228 if (EQ (coding->src_object, coding->dst_object))
6229 more_bytes = ((src_end - src) / 2) + 2;
6230 else
6231 more_bytes = src_end - src + 2;
6232 dst = alloc_destination (coding, more_bytes, dst);
6233 dst_end = coding->destination + coding->dst_bytes;
6234 coding_set_source (coding);
6235 src = coding->source + offset;
6236 src_end = coding->source + coding->src_bytes;
6237 if (EQ (coding->src_object, coding->dst_object))
6238 dst_end = (unsigned char *) src;
6239 }
6240 }
6241 EMIT_ONE_BYTE (c);
6242 }
6243 }
6244 else
6245 {
6246 if (!EQ (coding->src_object, coding->dst_object))
6247 {
6248 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6249
6250 if (require > 0)
6251 {
6252 EMACS_INT offset = src - coding->source;
6253
6254 dst = alloc_destination (coding, require, dst);
6255 coding_set_source (coding);
6256 src = coding->source + offset;
6257 src_end = coding->source + coding->src_bytes;
6258 }
6259 }
6260 produced_chars = coding->consumed_char;
6261 while (src < src_end)
6262 *dst++ = *src++;
6263 }
6264 }
6265
6266 produced = dst - (coding->destination + coding->produced);
6267 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6268 insert_from_gap (produced_chars, produced);
6269 coding->produced += produced;
6270 coding->produced_char += produced_chars;
6271 return carryover;
6272 }
6273
6274 /* Compose text in CODING->object according to the annotation data at
6275 CHARBUF. CHARBUF is an array:
6276 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6277 */
6278
6279 static INLINE void
6280 produce_composition (coding, charbuf, pos)
6281 struct coding_system *coding;
6282 int *charbuf;
6283 EMACS_INT pos;
6284 {
6285 int len;
6286 EMACS_INT to;
6287 enum composition_method method;
6288 Lisp_Object components;
6289
6290 len = -charbuf[0];
6291 to = pos + charbuf[2];
6292 if (to <= pos)
6293 return;
6294 method = (enum composition_method) (charbuf[3]);
6295
6296 if (method == COMPOSITION_RELATIVE)
6297 components = Qnil;
6298 else if (method >= COMPOSITION_WITH_RULE
6299 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6300 {
6301 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6302 int i;
6303
6304 len -= 4;
6305 charbuf += 4;
6306 for (i = 0; i < len; i++)
6307 {
6308 args[i] = make_number (charbuf[i]);
6309 if (charbuf[i] < 0)
6310 return;
6311 }
6312 components = (method == COMPOSITION_WITH_ALTCHARS
6313 ? Fstring (len, args) : Fvector (len, args));
6314 }
6315 else
6316 return;
6317 compose_text (pos, to, components, Qnil, coding->dst_object);
6318 }
6319
6320
6321 /* Put `charset' property on text in CODING->object according to
6322 the annotation data at CHARBUF. CHARBUF is an array:
6323 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6324 */
6325
6326 static INLINE void
6327 produce_charset (coding, charbuf, pos)
6328 struct coding_system *coding;
6329 int *charbuf;
6330 EMACS_INT pos;
6331 {
6332 EMACS_INT from = pos - charbuf[2];
6333 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6334
6335 Fput_text_property (make_number (from), make_number (pos),
6336 Qcharset, CHARSET_NAME (charset),
6337 coding->dst_object);
6338 }
6339
6340
6341 #define CHARBUF_SIZE 0x4000
6342
6343 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6344 do { \
6345 int size = CHARBUF_SIZE;; \
6346 \
6347 coding->charbuf = NULL; \
6348 while (size > 1024) \
6349 { \
6350 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6351 if (coding->charbuf) \
6352 break; \
6353 size >>= 1; \
6354 } \
6355 if (! coding->charbuf) \
6356 { \
6357 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6358 return coding->result; \
6359 } \
6360 coding->charbuf_size = size; \
6361 } while (0)
6362
6363
6364 static void
6365 produce_annotation (coding, pos)
6366 struct coding_system *coding;
6367 EMACS_INT pos;
6368 {
6369 int *charbuf = coding->charbuf;
6370 int *charbuf_end = charbuf + coding->charbuf_used;
6371
6372 if (NILP (coding->dst_object))
6373 return;
6374
6375 while (charbuf < charbuf_end)
6376 {
6377 if (*charbuf >= 0)
6378 pos += *charbuf++;
6379 else
6380 {
6381 int len = -*charbuf;
6382 switch (charbuf[1])
6383 {
6384 case CODING_ANNOTATE_COMPOSITION_MASK:
6385 produce_composition (coding, charbuf, pos);
6386 break;
6387 case CODING_ANNOTATE_CHARSET_MASK:
6388 produce_charset (coding, charbuf, pos);
6389 break;
6390 default:
6391 abort ();
6392 }
6393 charbuf += len;
6394 }
6395 }
6396 }
6397
6398 /* Decode the data at CODING->src_object into CODING->dst_object.
6399 CODING->src_object is a buffer, a string, or nil.
6400 CODING->dst_object is a buffer.
6401
6402 If CODING->src_object is a buffer, it must be the current buffer.
6403 In this case, if CODING->src_pos is positive, it is a position of
6404 the source text in the buffer, otherwise, the source text is in the
6405 gap area of the buffer, and CODING->src_pos specifies the offset of
6406 the text from GPT (which must be the same as PT). If this is the
6407 same buffer as CODING->dst_object, CODING->src_pos must be
6408 negative.
6409
6410 If CODING->src_object is a string, CODING->src_pos is an index to
6411 that string.
6412
6413 If CODING->src_object is nil, CODING->source must already point to
6414 the non-relocatable memory area. In this case, CODING->src_pos is
6415 an offset from CODING->source.
6416
6417 The decoded data is inserted at the current point of the buffer
6418 CODING->dst_object.
6419 */
6420
6421 static int
6422 decode_coding (coding)
6423 struct coding_system *coding;
6424 {
6425 Lisp_Object attrs;
6426 Lisp_Object undo_list;
6427 Lisp_Object translation_table;
6428 int carryover;
6429 int i;
6430
6431 if (BUFFERP (coding->src_object)
6432 && coding->src_pos > 0
6433 && coding->src_pos < GPT
6434 && coding->src_pos + coding->src_chars > GPT)
6435 move_gap_both (coding->src_pos, coding->src_pos_byte);
6436
6437 undo_list = Qt;
6438 if (BUFFERP (coding->dst_object))
6439 {
6440 if (current_buffer != XBUFFER (coding->dst_object))
6441 set_buffer_internal (XBUFFER (coding->dst_object));
6442 if (GPT != PT)
6443 move_gap_both (PT, PT_BYTE);
6444 undo_list = current_buffer->undo_list;
6445 current_buffer->undo_list = Qt;
6446 }
6447
6448 coding->consumed = coding->consumed_char = 0;
6449 coding->produced = coding->produced_char = 0;
6450 coding->chars_at_source = 0;
6451 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6452 coding->errors = 0;
6453
6454 ALLOC_CONVERSION_WORK_AREA (coding);
6455
6456 attrs = CODING_ID_ATTRS (coding->id);
6457 translation_table = get_translation_table (attrs, 0, NULL);
6458
6459 carryover = 0;
6460 do
6461 {
6462 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6463
6464 coding_set_source (coding);
6465 coding->annotated = 0;
6466 coding->charbuf_used = carryover;
6467 (*(coding->decoder)) (coding);
6468 coding_set_destination (coding);
6469 carryover = produce_chars (coding, translation_table, 0);
6470 if (coding->annotated)
6471 produce_annotation (coding, pos);
6472 for (i = 0; i < carryover; i++)
6473 coding->charbuf[i]
6474 = coding->charbuf[coding->charbuf_used - carryover + i];
6475 }
6476 while (coding->consumed < coding->src_bytes
6477 && (coding->result == CODING_RESULT_SUCCESS
6478 || coding->result == CODING_RESULT_INVALID_SRC));
6479
6480 if (carryover > 0)
6481 {
6482 coding_set_destination (coding);
6483 coding->charbuf_used = carryover;
6484 produce_chars (coding, translation_table, 1);
6485 }
6486
6487 coding->carryover_bytes = 0;
6488 if (coding->consumed < coding->src_bytes)
6489 {
6490 int nbytes = coding->src_bytes - coding->consumed;
6491 const unsigned char *src;
6492
6493 coding_set_source (coding);
6494 coding_set_destination (coding);
6495 src = coding->source + coding->consumed;
6496
6497 if (coding->mode & CODING_MODE_LAST_BLOCK)
6498 {
6499 /* Flush out unprocessed data as binary chars. We are sure
6500 that the number of data is less than the size of
6501 coding->charbuf. */
6502 coding->charbuf_used = 0;
6503 while (nbytes-- > 0)
6504 {
6505 int c = *src++;
6506
6507 if (c & 0x80)
6508 c = BYTE8_TO_CHAR (c);
6509 coding->charbuf[coding->charbuf_used++] = c;
6510 }
6511 produce_chars (coding, Qnil, 1);
6512 }
6513 else
6514 {
6515 /* Record unprocessed bytes in coding->carryover. We are
6516 sure that the number of data is less than the size of
6517 coding->carryover. */
6518 unsigned char *p = coding->carryover;
6519
6520 coding->carryover_bytes = nbytes;
6521 while (nbytes-- > 0)
6522 *p++ = *src++;
6523 }
6524 coding->consumed = coding->src_bytes;
6525 }
6526
6527 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6528 decode_eol (coding);
6529 if (BUFFERP (coding->dst_object))
6530 {
6531 current_buffer->undo_list = undo_list;
6532 record_insert (coding->dst_pos, coding->produced_char);
6533 }
6534 return coding->result;
6535 }
6536
6537
6538 /* Extract an annotation datum from a composition starting at POS and
6539 ending before LIMIT of CODING->src_object (buffer or string), store
6540 the data in BUF, set *STOP to a starting position of the next
6541 composition (if any) or to LIMIT, and return the address of the
6542 next element of BUF.
6543
6544 If such an annotation is not found, set *STOP to a starting
6545 position of a composition after POS (if any) or to LIMIT, and
6546 return BUF. */
6547
6548 static INLINE int *
6549 handle_composition_annotation (pos, limit, coding, buf, stop)
6550 EMACS_INT pos, limit;
6551 struct coding_system *coding;
6552 int *buf;
6553 EMACS_INT *stop;
6554 {
6555 EMACS_INT start, end;
6556 Lisp_Object prop;
6557
6558 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6559 || end > limit)
6560 *stop = limit;
6561 else if (start > pos)
6562 *stop = start;
6563 else
6564 {
6565 if (start == pos)
6566 {
6567 /* We found a composition. Store the corresponding
6568 annotation data in BUF. */
6569 int *head = buf;
6570 enum composition_method method = COMPOSITION_METHOD (prop);
6571 int nchars = COMPOSITION_LENGTH (prop);
6572
6573 ADD_COMPOSITION_DATA (buf, nchars, method);
6574 if (method != COMPOSITION_RELATIVE)
6575 {
6576 Lisp_Object components;
6577 int len, i, i_byte;
6578
6579 components = COMPOSITION_COMPONENTS (prop);
6580 if (VECTORP (components))
6581 {
6582 len = XVECTOR (components)->size;
6583 for (i = 0; i < len; i++)
6584 *buf++ = XINT (AREF (components, i));
6585 }
6586 else if (STRINGP (components))
6587 {
6588 len = SCHARS (components);
6589 i = i_byte = 0;
6590 while (i < len)
6591 {
6592 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6593 buf++;
6594 }
6595 }
6596 else if (INTEGERP (components))
6597 {
6598 len = 1;
6599 *buf++ = XINT (components);
6600 }
6601 else if (CONSP (components))
6602 {
6603 for (len = 0; CONSP (components);
6604 len++, components = XCDR (components))
6605 *buf++ = XINT (XCAR (components));
6606 }
6607 else
6608 abort ();
6609 *head -= len;
6610 }
6611 }
6612
6613 if (find_composition (end, limit, &start, &end, &prop,
6614 coding->src_object)
6615 && end <= limit)
6616 *stop = start;
6617 else
6618 *stop = limit;
6619 }
6620 return buf;
6621 }
6622
6623
6624 /* Extract an annotation datum from a text property `charset' at POS of
6625 CODING->src_object (buffer of string), store the data in BUF, set
6626 *STOP to the position where the value of `charset' property changes
6627 (limiting by LIMIT), and return the address of the next element of
6628 BUF.
6629
6630 If the property value is nil, set *STOP to the position where the
6631 property value is non-nil (limiting by LIMIT), and return BUF. */
6632
6633 static INLINE int *
6634 handle_charset_annotation (pos, limit, coding, buf, stop)
6635 EMACS_INT pos, limit;
6636 struct coding_system *coding;
6637 int *buf;
6638 EMACS_INT *stop;
6639 {
6640 Lisp_Object val, next;
6641 int id;
6642
6643 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6644 if (! NILP (val) && CHARSETP (val))
6645 id = XINT (CHARSET_SYMBOL_ID (val));
6646 else
6647 id = -1;
6648 ADD_CHARSET_DATA (buf, 0, id);
6649 next = Fnext_single_property_change (make_number (pos), Qcharset,
6650 coding->src_object,
6651 make_number (limit));
6652 *stop = XINT (next);
6653 return buf;
6654 }
6655
6656
6657 static void
6658 consume_chars (coding, translation_table, max_lookup)
6659 struct coding_system *coding;
6660 Lisp_Object translation_table;
6661 int max_lookup;
6662 {
6663 int *buf = coding->charbuf;
6664 int *buf_end = coding->charbuf + coding->charbuf_size;
6665 const unsigned char *src = coding->source + coding->consumed;
6666 const unsigned char *src_end = coding->source + coding->src_bytes;
6667 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6668 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6669 int multibytep = coding->src_multibyte;
6670 Lisp_Object eol_type;
6671 int c;
6672 EMACS_INT stop, stop_composition, stop_charset;
6673 int *lookup_buf = NULL;
6674
6675 if (! NILP (translation_table))
6676 lookup_buf = alloca (sizeof (int) * max_lookup);
6677
6678 eol_type = CODING_ID_EOL_TYPE (coding->id);
6679 if (VECTORP (eol_type))
6680 eol_type = Qunix;
6681
6682 /* Note: composition handling is not yet implemented. */
6683 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6684
6685 if (NILP (coding->src_object))
6686 stop = stop_composition = stop_charset = end_pos;
6687 else
6688 {
6689 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6690 stop = stop_composition = pos;
6691 else
6692 stop = stop_composition = end_pos;
6693 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6694 stop = stop_charset = pos;
6695 else
6696 stop_charset = end_pos;
6697 }
6698
6699 /* Compensate for CRLF and conversion. */
6700 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6701 while (buf < buf_end)
6702 {
6703 Lisp_Object trans;
6704
6705 if (pos == stop)
6706 {
6707 if (pos == end_pos)
6708 break;
6709 if (pos == stop_composition)
6710 buf = handle_composition_annotation (pos, end_pos, coding,
6711 buf, &stop_composition);
6712 if (pos == stop_charset)
6713 buf = handle_charset_annotation (pos, end_pos, coding,
6714 buf, &stop_charset);
6715 stop = (stop_composition < stop_charset
6716 ? stop_composition : stop_charset);
6717 }
6718
6719 if (! multibytep)
6720 {
6721 EMACS_INT bytes;
6722
6723 if (coding->encoder == encode_coding_raw_text)
6724 c = *src++, pos++;
6725 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6726 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
6727 else
6728 c = BYTE8_TO_CHAR (*src), src++, pos++;
6729 }
6730 else
6731 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
6732 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6733 c = '\n';
6734 if (! EQ (eol_type, Qunix))
6735 {
6736 if (c == '\n')
6737 {
6738 if (EQ (eol_type, Qdos))
6739 *buf++ = '\r';
6740 else
6741 c = '\r';
6742 }
6743 }
6744
6745 trans = Qnil;
6746 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6747 if (NILP (trans))
6748 *buf++ = c;
6749 else
6750 {
6751 int from_nchars = 1, to_nchars = 1;
6752 int *lookup_buf_end;
6753 const unsigned char *p = src;
6754 int i;
6755
6756 lookup_buf[0] = c;
6757 for (i = 1; i < max_lookup && p < src_end; i++)
6758 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6759 lookup_buf_end = lookup_buf + i;
6760 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6761 &from_nchars, &to_nchars);
6762 if (EQ (trans, Qt)
6763 || buf + to_nchars > buf_end)
6764 break;
6765 *buf++ = *lookup_buf;
6766 for (i = 1; i < to_nchars; i++)
6767 *buf++ = XINT (AREF (trans, i));
6768 for (i = 1; i < from_nchars; i++, pos++)
6769 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6770 }
6771 }
6772
6773 coding->consumed = src - coding->source;
6774 coding->consumed_char = pos - coding->src_pos;
6775 coding->charbuf_used = buf - coding->charbuf;
6776 coding->chars_at_source = 0;
6777 }
6778
6779
6780 /* Encode the text at CODING->src_object into CODING->dst_object.
6781 CODING->src_object is a buffer or a string.
6782 CODING->dst_object is a buffer or nil.
6783
6784 If CODING->src_object is a buffer, it must be the current buffer.
6785 In this case, if CODING->src_pos is positive, it is a position of
6786 the source text in the buffer, otherwise. the source text is in the
6787 gap area of the buffer, and coding->src_pos specifies the offset of
6788 the text from GPT (which must be the same as PT). If this is the
6789 same buffer as CODING->dst_object, CODING->src_pos must be
6790 negative and CODING should not have `pre-write-conversion'.
6791
6792 If CODING->src_object is a string, CODING should not have
6793 `pre-write-conversion'.
6794
6795 If CODING->dst_object is a buffer, the encoded data is inserted at
6796 the current point of that buffer.
6797
6798 If CODING->dst_object is nil, the encoded data is placed at the
6799 memory area specified by CODING->destination. */
6800
6801 static int
6802 encode_coding (coding)
6803 struct coding_system *coding;
6804 {
6805 Lisp_Object attrs;
6806 Lisp_Object translation_table;
6807 int max_lookup;
6808
6809 attrs = CODING_ID_ATTRS (coding->id);
6810 if (coding->encoder == encode_coding_raw_text)
6811 translation_table = Qnil, max_lookup = 0;
6812 else
6813 translation_table = get_translation_table (attrs, 1, &max_lookup);
6814
6815 if (BUFFERP (coding->dst_object))
6816 {
6817 set_buffer_internal (XBUFFER (coding->dst_object));
6818 coding->dst_multibyte
6819 = ! NILP (current_buffer->enable_multibyte_characters);
6820 }
6821
6822 coding->consumed = coding->consumed_char = 0;
6823 coding->produced = coding->produced_char = 0;
6824 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6825 coding->errors = 0;
6826
6827 ALLOC_CONVERSION_WORK_AREA (coding);
6828
6829 do {
6830 coding_set_source (coding);
6831 consume_chars (coding, translation_table, max_lookup);
6832 coding_set_destination (coding);
6833 (*(coding->encoder)) (coding);
6834 } while (coding->consumed_char < coding->src_chars);
6835
6836 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
6837 insert_from_gap (coding->produced_char, coding->produced);
6838
6839 return (coding->result);
6840 }
6841
6842
6843 /* Name (or base name) of work buffer for code conversion. */
6844 static Lisp_Object Vcode_conversion_workbuf_name;
6845
6846 /* A working buffer used by the top level conversion. Once it is
6847 created, it is never destroyed. It has the name
6848 Vcode_conversion_workbuf_name. The other working buffers are
6849 destroyed after the use is finished, and their names are modified
6850 versions of Vcode_conversion_workbuf_name. */
6851 static Lisp_Object Vcode_conversion_reused_workbuf;
6852
6853 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6854 static int reused_workbuf_in_use;
6855
6856
6857 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6858 multibyteness of returning buffer. */
6859
6860 static Lisp_Object
6861 make_conversion_work_buffer (multibyte)
6862 int multibyte;
6863 {
6864 Lisp_Object name, workbuf;
6865 struct buffer *current;
6866
6867 if (reused_workbuf_in_use++)
6868 {
6869 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6870 workbuf = Fget_buffer_create (name);
6871 }
6872 else
6873 {
6874 name = Vcode_conversion_workbuf_name;
6875 workbuf = Fget_buffer_create (name);
6876 if (NILP (Vcode_conversion_reused_workbuf))
6877 Vcode_conversion_reused_workbuf = workbuf;
6878 }
6879 current = current_buffer;
6880 set_buffer_internal (XBUFFER (workbuf));
6881 Ferase_buffer ();
6882 current_buffer->undo_list = Qt;
6883 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6884 set_buffer_internal (current);
6885 return workbuf;
6886 }
6887
6888
6889 static Lisp_Object
6890 code_conversion_restore (arg)
6891 Lisp_Object arg;
6892 {
6893 Lisp_Object current, workbuf;
6894 struct gcpro gcpro1;
6895
6896 GCPRO1 (arg);
6897 current = XCAR (arg);
6898 workbuf = XCDR (arg);
6899 if (! NILP (workbuf))
6900 {
6901 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6902 reused_workbuf_in_use = 0;
6903 else if (! NILP (Fbuffer_live_p (workbuf)))
6904 Fkill_buffer (workbuf);
6905 }
6906 set_buffer_internal (XBUFFER (current));
6907 UNGCPRO;
6908 return Qnil;
6909 }
6910
6911 Lisp_Object
6912 code_conversion_save (with_work_buf, multibyte)
6913 int with_work_buf, multibyte;
6914 {
6915 Lisp_Object workbuf = Qnil;
6916
6917 if (with_work_buf)
6918 workbuf = make_conversion_work_buffer (multibyte);
6919 record_unwind_protect (code_conversion_restore,
6920 Fcons (Fcurrent_buffer (), workbuf));
6921 return workbuf;
6922 }
6923
6924 int
6925 decode_coding_gap (coding, chars, bytes)
6926 struct coding_system *coding;
6927 EMACS_INT chars, bytes;
6928 {
6929 int count = specpdl_ptr - specpdl;
6930 Lisp_Object attrs;
6931
6932 code_conversion_save (0, 0);
6933
6934 coding->src_object = Fcurrent_buffer ();
6935 coding->src_chars = chars;
6936 coding->src_bytes = bytes;
6937 coding->src_pos = -chars;
6938 coding->src_pos_byte = -bytes;
6939 coding->src_multibyte = chars < bytes;
6940 coding->dst_object = coding->src_object;
6941 coding->dst_pos = PT;
6942 coding->dst_pos_byte = PT_BYTE;
6943 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6944
6945 if (CODING_REQUIRE_DETECTION (coding))
6946 detect_coding (coding);
6947
6948 coding->mode |= CODING_MODE_LAST_BLOCK;
6949 current_buffer->text->inhibit_shrinking = 1;
6950 decode_coding (coding);
6951 current_buffer->text->inhibit_shrinking = 0;
6952
6953 attrs = CODING_ID_ATTRS (coding->id);
6954 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6955 {
6956 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6957 Lisp_Object val;
6958
6959 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6960 val = call1 (CODING_ATTR_POST_READ (attrs),
6961 make_number (coding->produced_char));
6962 CHECK_NATNUM (val);
6963 coding->produced_char += Z - prev_Z;
6964 coding->produced += Z_BYTE - prev_Z_BYTE;
6965 }
6966
6967 unbind_to (count, Qnil);
6968 return coding->result;
6969 }
6970
6971 int
6972 encode_coding_gap (coding, chars, bytes)
6973 struct coding_system *coding;
6974 EMACS_INT chars, bytes;
6975 {
6976 int count = specpdl_ptr - specpdl;
6977
6978 code_conversion_save (0, 0);
6979
6980 coding->src_object = Fcurrent_buffer ();
6981 coding->src_chars = chars;
6982 coding->src_bytes = bytes;
6983 coding->src_pos = -chars;
6984 coding->src_pos_byte = -bytes;
6985 coding->src_multibyte = chars < bytes;
6986 coding->dst_object = coding->src_object;
6987 coding->dst_pos = PT;
6988 coding->dst_pos_byte = PT_BYTE;
6989
6990 encode_coding (coding);
6991
6992 unbind_to (count, Qnil);
6993 return coding->result;
6994 }
6995
6996
6997 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6998 SRC_OBJECT into DST_OBJECT by coding context CODING.
6999
7000 SRC_OBJECT is a buffer, a string, or Qnil.
7001
7002 If it is a buffer, the text is at point of the buffer. FROM and TO
7003 are positions in the buffer.
7004
7005 If it is a string, the text is at the beginning of the string.
7006 FROM and TO are indices to the string.
7007
7008 If it is nil, the text is at coding->source. FROM and TO are
7009 indices to coding->source.
7010
7011 DST_OBJECT is a buffer, Qt, or Qnil.
7012
7013 If it is a buffer, the decoded text is inserted at point of the
7014 buffer. If the buffer is the same as SRC_OBJECT, the source text
7015 is deleted.
7016
7017 If it is Qt, a string is made from the decoded text, and
7018 set in CODING->dst_object.
7019
7020 If it is Qnil, the decoded text is stored at CODING->destination.
7021 The caller must allocate CODING->dst_bytes bytes at
7022 CODING->destination by xmalloc. If the decoded text is longer than
7023 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7024 */
7025
7026 void
7027 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7028 dst_object)
7029 struct coding_system *coding;
7030 Lisp_Object src_object;
7031 EMACS_INT from, from_byte, to, to_byte;
7032 Lisp_Object dst_object;
7033 {
7034 int count = specpdl_ptr - specpdl;
7035 unsigned char *destination;
7036 EMACS_INT dst_bytes;
7037 EMACS_INT chars = to - from;
7038 EMACS_INT bytes = to_byte - from_byte;
7039 Lisp_Object attrs;
7040 int saved_pt = -1, saved_pt_byte;
7041 int need_marker_adjustment = 0;
7042 Lisp_Object old_deactivate_mark;
7043
7044 old_deactivate_mark = Vdeactivate_mark;
7045
7046 if (NILP (dst_object))
7047 {
7048 destination = coding->destination;
7049 dst_bytes = coding->dst_bytes;
7050 }
7051
7052 coding->src_object = src_object;
7053 coding->src_chars = chars;
7054 coding->src_bytes = bytes;
7055 coding->src_multibyte = chars < bytes;
7056
7057 if (STRINGP (src_object))
7058 {
7059 coding->src_pos = from;
7060 coding->src_pos_byte = from_byte;
7061 }
7062 else if (BUFFERP (src_object))
7063 {
7064 set_buffer_internal (XBUFFER (src_object));
7065 if (from != GPT)
7066 move_gap_both (from, from_byte);
7067 if (EQ (src_object, dst_object))
7068 {
7069 struct Lisp_Marker *tail;
7070
7071 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7072 {
7073 tail->need_adjustment
7074 = tail->charpos == (tail->insertion_type ? from : to);
7075 need_marker_adjustment |= tail->need_adjustment;
7076 }
7077 saved_pt = PT, saved_pt_byte = PT_BYTE;
7078 TEMP_SET_PT_BOTH (from, from_byte);
7079 current_buffer->text->inhibit_shrinking = 1;
7080 del_range_both (from, from_byte, to, to_byte, 1);
7081 coding->src_pos = -chars;
7082 coding->src_pos_byte = -bytes;
7083 }
7084 else
7085 {
7086 coding->src_pos = from;
7087 coding->src_pos_byte = from_byte;
7088 }
7089 }
7090
7091 if (CODING_REQUIRE_DETECTION (coding))
7092 detect_coding (coding);
7093 attrs = CODING_ID_ATTRS (coding->id);
7094
7095 if (EQ (dst_object, Qt)
7096 || (! NILP (CODING_ATTR_POST_READ (attrs))
7097 && NILP (dst_object)))
7098 {
7099 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7100 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7101 coding->dst_pos = BEG;
7102 coding->dst_pos_byte = BEG_BYTE;
7103 }
7104 else if (BUFFERP (dst_object))
7105 {
7106 code_conversion_save (0, 0);
7107 coding->dst_object = dst_object;
7108 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7109 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7110 coding->dst_multibyte
7111 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7112 }
7113 else
7114 {
7115 code_conversion_save (0, 0);
7116 coding->dst_object = Qnil;
7117 /* Most callers presume this will return a multibyte result, and they
7118 won't use `binary' or `raw-text' anyway, so let's not worry about
7119 CODING_FOR_UNIBYTE. */
7120 coding->dst_multibyte = 1;
7121 }
7122
7123 decode_coding (coding);
7124
7125 if (BUFFERP (coding->dst_object))
7126 set_buffer_internal (XBUFFER (coding->dst_object));
7127
7128 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7129 {
7130 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7131 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7132 Lisp_Object val;
7133
7134 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7135 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7136 old_deactivate_mark);
7137 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7138 make_number (coding->produced_char));
7139 UNGCPRO;
7140 CHECK_NATNUM (val);
7141 coding->produced_char += Z - prev_Z;
7142 coding->produced += Z_BYTE - prev_Z_BYTE;
7143 }
7144
7145 if (EQ (dst_object, Qt))
7146 {
7147 coding->dst_object = Fbuffer_string ();
7148 }
7149 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7150 {
7151 set_buffer_internal (XBUFFER (coding->dst_object));
7152 if (dst_bytes < coding->produced)
7153 {
7154 destination = xrealloc (destination, coding->produced);
7155 if (! destination)
7156 {
7157 record_conversion_result (coding,
7158 CODING_RESULT_INSUFFICIENT_DST);
7159 unbind_to (count, Qnil);
7160 return;
7161 }
7162 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7163 move_gap_both (BEGV, BEGV_BYTE);
7164 bcopy (BEGV_ADDR, destination, coding->produced);
7165 coding->destination = destination;
7166 }
7167 }
7168
7169 if (saved_pt >= 0)
7170 {
7171 /* This is the case of:
7172 (BUFFERP (src_object) && EQ (src_object, dst_object))
7173 As we have moved PT while replacing the original buffer
7174 contents, we must recover it now. */
7175 set_buffer_internal (XBUFFER (src_object));
7176 current_buffer->text->inhibit_shrinking = 0;
7177 if (saved_pt < from)
7178 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7179 else if (saved_pt < from + chars)
7180 TEMP_SET_PT_BOTH (from, from_byte);
7181 else if (! NILP (current_buffer->enable_multibyte_characters))
7182 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7183 saved_pt_byte + (coding->produced - bytes));
7184 else
7185 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7186 saved_pt_byte + (coding->produced - bytes));
7187
7188 if (need_marker_adjustment)
7189 {
7190 struct Lisp_Marker *tail;
7191
7192 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7193 if (tail->need_adjustment)
7194 {
7195 tail->need_adjustment = 0;
7196 if (tail->insertion_type)
7197 {
7198 tail->bytepos = from_byte;
7199 tail->charpos = from;
7200 }
7201 else
7202 {
7203 tail->bytepos = from_byte + coding->produced;
7204 tail->charpos
7205 = (NILP (current_buffer->enable_multibyte_characters)
7206 ? tail->bytepos : from + coding->produced_char);
7207 }
7208 }
7209 }
7210 }
7211
7212 Vdeactivate_mark = old_deactivate_mark;
7213 unbind_to (count, coding->dst_object);
7214 }
7215
7216
7217 void
7218 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7219 dst_object)
7220 struct coding_system *coding;
7221 Lisp_Object src_object;
7222 EMACS_INT from, from_byte, to, to_byte;
7223 Lisp_Object dst_object;
7224 {
7225 int count = specpdl_ptr - specpdl;
7226 EMACS_INT chars = to - from;
7227 EMACS_INT bytes = to_byte - from_byte;
7228 Lisp_Object attrs;
7229 int saved_pt = -1, saved_pt_byte;
7230 int need_marker_adjustment = 0;
7231 int kill_src_buffer = 0;
7232 Lisp_Object old_deactivate_mark;
7233
7234 old_deactivate_mark = Vdeactivate_mark;
7235
7236 coding->src_object = src_object;
7237 coding->src_chars = chars;
7238 coding->src_bytes = bytes;
7239 coding->src_multibyte = chars < bytes;
7240
7241 attrs = CODING_ID_ATTRS (coding->id);
7242
7243 if (EQ (src_object, dst_object))
7244 {
7245 struct Lisp_Marker *tail;
7246
7247 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7248 {
7249 tail->need_adjustment
7250 = tail->charpos == (tail->insertion_type ? from : to);
7251 need_marker_adjustment |= tail->need_adjustment;
7252 }
7253 }
7254
7255 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7256 {
7257 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7258 set_buffer_internal (XBUFFER (coding->src_object));
7259 if (STRINGP (src_object))
7260 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7261 else if (BUFFERP (src_object))
7262 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7263 else
7264 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7265
7266 if (EQ (src_object, dst_object))
7267 {
7268 set_buffer_internal (XBUFFER (src_object));
7269 saved_pt = PT, saved_pt_byte = PT_BYTE;
7270 del_range_both (from, from_byte, to, to_byte, 1);
7271 set_buffer_internal (XBUFFER (coding->src_object));
7272 }
7273
7274 {
7275 Lisp_Object args[3];
7276 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7277
7278 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7279 old_deactivate_mark);
7280 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7281 args[1] = make_number (BEG);
7282 args[2] = make_number (Z);
7283 safe_call (3, args);
7284 UNGCPRO;
7285 }
7286 if (XBUFFER (coding->src_object) != current_buffer)
7287 kill_src_buffer = 1;
7288 coding->src_object = Fcurrent_buffer ();
7289 if (BEG != GPT)
7290 move_gap_both (BEG, BEG_BYTE);
7291 coding->src_chars = Z - BEG;
7292 coding->src_bytes = Z_BYTE - BEG_BYTE;
7293 coding->src_pos = BEG;
7294 coding->src_pos_byte = BEG_BYTE;
7295 coding->src_multibyte = Z < Z_BYTE;
7296 }
7297 else if (STRINGP (src_object))
7298 {
7299 code_conversion_save (0, 0);
7300 coding->src_pos = from;
7301 coding->src_pos_byte = from_byte;
7302 }
7303 else if (BUFFERP (src_object))
7304 {
7305 code_conversion_save (0, 0);
7306 set_buffer_internal (XBUFFER (src_object));
7307 if (EQ (src_object, dst_object))
7308 {
7309 saved_pt = PT, saved_pt_byte = PT_BYTE;
7310 coding->src_object = del_range_1 (from, to, 1, 1);
7311 coding->src_pos = 0;
7312 coding->src_pos_byte = 0;
7313 }
7314 else
7315 {
7316 if (from < GPT && to >= GPT)
7317 move_gap_both (from, from_byte);
7318 coding->src_pos = from;
7319 coding->src_pos_byte = from_byte;
7320 }
7321 }
7322 else
7323 code_conversion_save (0, 0);
7324
7325 if (BUFFERP (dst_object))
7326 {
7327 coding->dst_object = dst_object;
7328 if (EQ (src_object, dst_object))
7329 {
7330 coding->dst_pos = from;
7331 coding->dst_pos_byte = from_byte;
7332 }
7333 else
7334 {
7335 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7336 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7337 }
7338 coding->dst_multibyte
7339 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7340 }
7341 else if (EQ (dst_object, Qt))
7342 {
7343 coding->dst_object = Qnil;
7344 coding->dst_bytes = coding->src_chars;
7345 if (coding->dst_bytes == 0)
7346 coding->dst_bytes = 1;
7347 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7348 coding->dst_multibyte = 0;
7349 }
7350 else
7351 {
7352 coding->dst_object = Qnil;
7353 coding->dst_multibyte = 0;
7354 }
7355
7356 encode_coding (coding);
7357
7358 if (EQ (dst_object, Qt))
7359 {
7360 if (BUFFERP (coding->dst_object))
7361 coding->dst_object = Fbuffer_string ();
7362 else
7363 {
7364 coding->dst_object
7365 = make_unibyte_string ((char *) coding->destination,
7366 coding->produced);
7367 xfree (coding->destination);
7368 }
7369 }
7370
7371 if (saved_pt >= 0)
7372 {
7373 /* This is the case of:
7374 (BUFFERP (src_object) && EQ (src_object, dst_object))
7375 As we have moved PT while replacing the original buffer
7376 contents, we must recover it now. */
7377 set_buffer_internal (XBUFFER (src_object));
7378 if (saved_pt < from)
7379 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7380 else if (saved_pt < from + chars)
7381 TEMP_SET_PT_BOTH (from, from_byte);
7382 else if (! NILP (current_buffer->enable_multibyte_characters))
7383 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7384 saved_pt_byte + (coding->produced - bytes));
7385 else
7386 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7387 saved_pt_byte + (coding->produced - bytes));
7388
7389 if (need_marker_adjustment)
7390 {
7391 struct Lisp_Marker *tail;
7392
7393 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7394 if (tail->need_adjustment)
7395 {
7396 tail->need_adjustment = 0;
7397 if (tail->insertion_type)
7398 {
7399 tail->bytepos = from_byte;
7400 tail->charpos = from;
7401 }
7402 else
7403 {
7404 tail->bytepos = from_byte + coding->produced;
7405 tail->charpos
7406 = (NILP (current_buffer->enable_multibyte_characters)
7407 ? tail->bytepos : from + coding->produced_char);
7408 }
7409 }
7410 }
7411 }
7412
7413 if (kill_src_buffer)
7414 Fkill_buffer (coding->src_object);
7415
7416 Vdeactivate_mark = old_deactivate_mark;
7417 unbind_to (count, Qnil);
7418 }
7419
7420
7421 Lisp_Object
7422 preferred_coding_system ()
7423 {
7424 int id = coding_categories[coding_priorities[0]].id;
7425
7426 return CODING_ID_NAME (id);
7427 }
7428
7429 \f
7430 #ifdef emacs
7431 /*** 8. Emacs Lisp library functions ***/
7432
7433 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7434 doc: /* Return t if OBJECT is nil or a coding-system.
7435 See the documentation of `define-coding-system' for information
7436 about coding-system objects. */)
7437 (obj)
7438 Lisp_Object obj;
7439 {
7440 if (NILP (obj)
7441 || CODING_SYSTEM_ID (obj) >= 0)
7442 return Qt;
7443 if (! SYMBOLP (obj)
7444 || NILP (Fget (obj, Qcoding_system_define_form)))
7445 return Qnil;
7446 return Qt;
7447 }
7448
7449 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7450 Sread_non_nil_coding_system, 1, 1, 0,
7451 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7452 (prompt)
7453 Lisp_Object prompt;
7454 {
7455 Lisp_Object val;
7456 do
7457 {
7458 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7459 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7460 }
7461 while (SCHARS (val) == 0);
7462 return (Fintern (val, Qnil));
7463 }
7464
7465 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7466 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7467 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7468 Ignores case when completing coding systems (all Emacs coding systems
7469 are lower-case). */)
7470 (prompt, default_coding_system)
7471 Lisp_Object prompt, default_coding_system;
7472 {
7473 Lisp_Object val;
7474 int count = SPECPDL_INDEX ();
7475
7476 if (SYMBOLP (default_coding_system))
7477 default_coding_system = SYMBOL_NAME (default_coding_system);
7478 specbind (Qcompletion_ignore_case, Qt);
7479 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7480 Qt, Qnil, Qcoding_system_history,
7481 default_coding_system, Qnil);
7482 unbind_to (count, Qnil);
7483 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7484 }
7485
7486 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7487 1, 1, 0,
7488 doc: /* Check validity of CODING-SYSTEM.
7489 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7490 It is valid if it is nil or a symbol defined as a coding system by the
7491 function `define-coding-system'. */)
7492 (coding_system)
7493 Lisp_Object coding_system;
7494 {
7495 Lisp_Object define_form;
7496
7497 define_form = Fget (coding_system, Qcoding_system_define_form);
7498 if (! NILP (define_form))
7499 {
7500 Fput (coding_system, Qcoding_system_define_form, Qnil);
7501 safe_eval (define_form);
7502 }
7503 if (!NILP (Fcoding_system_p (coding_system)))
7504 return coding_system;
7505 xsignal1 (Qcoding_system_error, coding_system);
7506 }
7507
7508 \f
7509 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7510 HIGHEST is nonzero, return the coding system of the highest
7511 priority among the detected coding systems. Otherwize return a
7512 list of detected coding systems sorted by their priorities. If
7513 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7514 multibyte form but contains only ASCII and eight-bit chars.
7515 Otherwise, the bytes are raw bytes.
7516
7517 CODING-SYSTEM controls the detection as below:
7518
7519 If it is nil, detect both text-format and eol-format. If the
7520 text-format part of CODING-SYSTEM is already specified
7521 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7522 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7523 detect only text-format. */
7524
7525 Lisp_Object
7526 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7527 coding_system)
7528 const unsigned char *src;
7529 EMACS_INT src_chars, src_bytes;
7530 int highest;
7531 int multibytep;
7532 Lisp_Object coding_system;
7533 {
7534 const unsigned char *src_end = src + src_bytes;
7535 Lisp_Object attrs, eol_type;
7536 Lisp_Object val;
7537 struct coding_system coding;
7538 int id;
7539 struct coding_detection_info detect_info;
7540 enum coding_category base_category;
7541 int null_byte_found = 0, eight_bit_found = 0;
7542
7543 if (NILP (coding_system))
7544 coding_system = Qundecided;
7545 setup_coding_system (coding_system, &coding);
7546 attrs = CODING_ID_ATTRS (coding.id);
7547 eol_type = CODING_ID_EOL_TYPE (coding.id);
7548 coding_system = CODING_ATTR_BASE_NAME (attrs);
7549
7550 coding.source = src;
7551 coding.src_chars = src_chars;
7552 coding.src_bytes = src_bytes;
7553 coding.src_multibyte = multibytep;
7554 coding.consumed = 0;
7555 coding.mode |= CODING_MODE_LAST_BLOCK;
7556
7557 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7558
7559 /* At first, detect text-format if necessary. */
7560 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7561 if (base_category == coding_category_undecided)
7562 {
7563 enum coding_category category;
7564 struct coding_system *this;
7565 int c, i;
7566
7567 coding.head_ascii = -1;
7568 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7569 for (; src < src_end; src++)
7570 {
7571 c = *src;
7572 if (c & 0x80)
7573 {
7574 eight_bit_found = 1;
7575 if (coding.head_ascii < 0)
7576 coding.head_ascii = src - coding.source;
7577 if (null_byte_found)
7578 break;
7579 }
7580 if (c < 0x20)
7581 {
7582 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7583 && ! inhibit_iso_escape_detection
7584 && ! detect_info.checked)
7585 {
7586 if (coding.head_ascii < 0)
7587 coding.head_ascii = src - coding.source;
7588 if (detect_coding_iso_2022 (&coding, &detect_info))
7589 {
7590 /* We have scanned the whole data. */
7591 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7592 /* We didn't find an 8-bit code. We may have
7593 found a null-byte, but it's very rare that
7594 a binary file confirm to ISO-2022. */
7595 src = src_end;
7596 break;
7597 }
7598 }
7599 else if (! c)
7600 {
7601 null_byte_found = 1;
7602 if (eight_bit_found)
7603 break;
7604 }
7605 }
7606 }
7607 if (coding.head_ascii < 0)
7608 coding.head_ascii = src - coding.source;
7609
7610 if (null_byte_found || eight_bit_found
7611 || coding.head_ascii < coding.src_bytes
7612 || detect_info.found)
7613 {
7614 if (coding.head_ascii == coding.src_bytes)
7615 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7616 for (i = 0; i < coding_category_raw_text; i++)
7617 {
7618 category = coding_priorities[i];
7619 this = coding_categories + category;
7620 if (detect_info.found & (1 << category))
7621 break;
7622 }
7623 else
7624 {
7625 if (null_byte_found)
7626 {
7627 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7628 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7629 }
7630 for (i = 0; i < coding_category_raw_text; i++)
7631 {
7632 category = coding_priorities[i];
7633 this = coding_categories + category;
7634
7635 if (this->id < 0)
7636 {
7637 /* No coding system of this category is defined. */
7638 detect_info.rejected |= (1 << category);
7639 }
7640 else if (category >= coding_category_raw_text)
7641 continue;
7642 else if (detect_info.checked & (1 << category))
7643 {
7644 if (highest
7645 && (detect_info.found & (1 << category)))
7646 break;
7647 }
7648 else if ((*(this->detector)) (&coding, &detect_info)
7649 && highest
7650 && (detect_info.found & (1 << category)))
7651 {
7652 if (category == coding_category_utf_16_auto)
7653 {
7654 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7655 category = coding_category_utf_16_le;
7656 else
7657 category = coding_category_utf_16_be;
7658 }
7659 break;
7660 }
7661 }
7662 }
7663 }
7664
7665 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
7666 {
7667 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7668 id = coding_categories[coding_category_raw_text].id;
7669 val = Fcons (make_number (id), Qnil);
7670 }
7671 else if (! detect_info.rejected && ! detect_info.found)
7672 {
7673 detect_info.found = CATEGORY_MASK_ANY;
7674 id = coding_categories[coding_category_undecided].id;
7675 val = Fcons (make_number (id), Qnil);
7676 }
7677 else if (highest)
7678 {
7679 if (detect_info.found)
7680 {
7681 detect_info.found = 1 << category;
7682 val = Fcons (make_number (this->id), Qnil);
7683 }
7684 else
7685 for (i = 0; i < coding_category_raw_text; i++)
7686 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7687 {
7688 detect_info.found = 1 << coding_priorities[i];
7689 id = coding_categories[coding_priorities[i]].id;
7690 val = Fcons (make_number (id), Qnil);
7691 break;
7692 }
7693 }
7694 else
7695 {
7696 int mask = detect_info.rejected | detect_info.found;
7697 int found = 0;
7698 val = Qnil;
7699
7700 for (i = coding_category_raw_text - 1; i >= 0; i--)
7701 {
7702 category = coding_priorities[i];
7703 if (! (mask & (1 << category)))
7704 {
7705 found |= 1 << category;
7706 id = coding_categories[category].id;
7707 if (id >= 0)
7708 val = Fcons (make_number (id), val);
7709 }
7710 }
7711 for (i = coding_category_raw_text - 1; i >= 0; i--)
7712 {
7713 category = coding_priorities[i];
7714 if (detect_info.found & (1 << category))
7715 {
7716 id = coding_categories[category].id;
7717 val = Fcons (make_number (id), val);
7718 }
7719 }
7720 detect_info.found |= found;
7721 }
7722 }
7723 else if (base_category == coding_category_utf_16_auto)
7724 {
7725 if (detect_coding_utf_16 (&coding, &detect_info))
7726 {
7727 struct coding_system *this;
7728
7729 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7730 this = coding_categories + coding_category_utf_16_le;
7731 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7732 this = coding_categories + coding_category_utf_16_be;
7733 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7734 this = coding_categories + coding_category_utf_16_be_nosig;
7735 else
7736 this = coding_categories + coding_category_utf_16_le_nosig;
7737 val = Fcons (make_number (this->id), Qnil);
7738 }
7739 }
7740 else
7741 {
7742 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7743 val = Fcons (make_number (coding.id), Qnil);
7744 }
7745
7746 /* Then, detect eol-format if necessary. */
7747 {
7748 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7749 Lisp_Object tail;
7750
7751 if (VECTORP (eol_type))
7752 {
7753 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7754 {
7755 if (null_byte_found)
7756 normal_eol = EOL_SEEN_LF;
7757 else
7758 normal_eol = detect_eol (coding.source, src_bytes,
7759 coding_category_raw_text);
7760 }
7761 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7762 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7763 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7764 coding_category_utf_16_be);
7765 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7766 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7767 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7768 coding_category_utf_16_le);
7769 }
7770 else
7771 {
7772 if (EQ (eol_type, Qunix))
7773 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7774 else if (EQ (eol_type, Qdos))
7775 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7776 else
7777 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7778 }
7779
7780 for (tail = val; CONSP (tail); tail = XCDR (tail))
7781 {
7782 enum coding_category category;
7783 int this_eol;
7784
7785 id = XINT (XCAR (tail));
7786 attrs = CODING_ID_ATTRS (id);
7787 category = XINT (CODING_ATTR_CATEGORY (attrs));
7788 eol_type = CODING_ID_EOL_TYPE (id);
7789 if (VECTORP (eol_type))
7790 {
7791 if (category == coding_category_utf_16_be
7792 || category == coding_category_utf_16_be_nosig)
7793 this_eol = utf_16_be_eol;
7794 else if (category == coding_category_utf_16_le
7795 || category == coding_category_utf_16_le_nosig)
7796 this_eol = utf_16_le_eol;
7797 else
7798 this_eol = normal_eol;
7799
7800 if (this_eol == EOL_SEEN_LF)
7801 XSETCAR (tail, AREF (eol_type, 0));
7802 else if (this_eol == EOL_SEEN_CRLF)
7803 XSETCAR (tail, AREF (eol_type, 1));
7804 else if (this_eol == EOL_SEEN_CR)
7805 XSETCAR (tail, AREF (eol_type, 2));
7806 else
7807 XSETCAR (tail, CODING_ID_NAME (id));
7808 }
7809 else
7810 XSETCAR (tail, CODING_ID_NAME (id));
7811 }
7812 }
7813
7814 return (highest ? XCAR (val) : val);
7815 }
7816
7817
7818 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7819 2, 3, 0,
7820 doc: /* Detect coding system of the text in the region between START and END.
7821 Return a list of possible coding systems ordered by priority.
7822
7823 If only ASCII characters are found (except for such ISO-2022 control
7824 characters ISO-2022 as ESC), it returns a list of single element
7825 `undecided' or its subsidiary coding system according to a detected
7826 end-of-line format.
7827
7828 If optional argument HIGHEST is non-nil, return the coding system of
7829 highest priority. */)
7830 (start, end, highest)
7831 Lisp_Object start, end, highest;
7832 {
7833 int from, to;
7834 int from_byte, to_byte;
7835
7836 CHECK_NUMBER_COERCE_MARKER (start);
7837 CHECK_NUMBER_COERCE_MARKER (end);
7838
7839 validate_region (&start, &end);
7840 from = XINT (start), to = XINT (end);
7841 from_byte = CHAR_TO_BYTE (from);
7842 to_byte = CHAR_TO_BYTE (to);
7843
7844 if (from < GPT && to >= GPT)
7845 move_gap_both (to, to_byte);
7846
7847 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7848 to - from, to_byte - from_byte,
7849 !NILP (highest),
7850 !NILP (current_buffer
7851 ->enable_multibyte_characters),
7852 Qnil);
7853 }
7854
7855 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7856 1, 2, 0,
7857 doc: /* Detect coding system of the text in STRING.
7858 Return a list of possible coding systems ordered by priority.
7859
7860 If only ASCII characters are found (except for such ISO-2022 control
7861 characters ISO-2022 as ESC), it returns a list of single element
7862 `undecided' or its subsidiary coding system according to a detected
7863 end-of-line format.
7864
7865 If optional argument HIGHEST is non-nil, return the coding system of
7866 highest priority. */)
7867 (string, highest)
7868 Lisp_Object string, highest;
7869 {
7870 CHECK_STRING (string);
7871
7872 return detect_coding_system (SDATA (string),
7873 SCHARS (string), SBYTES (string),
7874 !NILP (highest), STRING_MULTIBYTE (string),
7875 Qnil);
7876 }
7877
7878
7879 static INLINE int
7880 char_encodable_p (c, attrs)
7881 int c;
7882 Lisp_Object attrs;
7883 {
7884 Lisp_Object tail;
7885 struct charset *charset;
7886 Lisp_Object translation_table;
7887
7888 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7889 if (! NILP (translation_table))
7890 c = translate_char (translation_table, c);
7891 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7892 CONSP (tail); tail = XCDR (tail))
7893 {
7894 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7895 if (CHAR_CHARSET_P (c, charset))
7896 break;
7897 }
7898 return (! NILP (tail));
7899 }
7900
7901
7902 /* Return a list of coding systems that safely encode the text between
7903 START and END. If EXCLUDE is non-nil, it is a list of coding
7904 systems not to check. The returned list doesn't contain any such
7905 coding systems. In any case, if the text contains only ASCII or is
7906 unibyte, return t. */
7907
7908 DEFUN ("find-coding-systems-region-internal",
7909 Ffind_coding_systems_region_internal,
7910 Sfind_coding_systems_region_internal, 2, 3, 0,
7911 doc: /* Internal use only. */)
7912 (start, end, exclude)
7913 Lisp_Object start, end, exclude;
7914 {
7915 Lisp_Object coding_attrs_list, safe_codings;
7916 EMACS_INT start_byte, end_byte;
7917 const unsigned char *p, *pbeg, *pend;
7918 int c;
7919 Lisp_Object tail, elt;
7920
7921 if (STRINGP (start))
7922 {
7923 if (!STRING_MULTIBYTE (start)
7924 || SCHARS (start) == SBYTES (start))
7925 return Qt;
7926 start_byte = 0;
7927 end_byte = SBYTES (start);
7928 }
7929 else
7930 {
7931 CHECK_NUMBER_COERCE_MARKER (start);
7932 CHECK_NUMBER_COERCE_MARKER (end);
7933 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7934 args_out_of_range (start, end);
7935 if (NILP (current_buffer->enable_multibyte_characters))
7936 return Qt;
7937 start_byte = CHAR_TO_BYTE (XINT (start));
7938 end_byte = CHAR_TO_BYTE (XINT (end));
7939 if (XINT (end) - XINT (start) == end_byte - start_byte)
7940 return Qt;
7941
7942 if (XINT (start) < GPT && XINT (end) > GPT)
7943 {
7944 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7945 move_gap_both (XINT (start), start_byte);
7946 else
7947 move_gap_both (XINT (end), end_byte);
7948 }
7949 }
7950
7951 coding_attrs_list = Qnil;
7952 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7953 if (NILP (exclude)
7954 || NILP (Fmemq (XCAR (tail), exclude)))
7955 {
7956 Lisp_Object attrs;
7957
7958 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7959 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7960 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7961 {
7962 ASET (attrs, coding_attr_trans_tbl,
7963 get_translation_table (attrs, 1, NULL));
7964 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7965 }
7966 }
7967
7968 if (STRINGP (start))
7969 p = pbeg = SDATA (start);
7970 else
7971 p = pbeg = BYTE_POS_ADDR (start_byte);
7972 pend = p + (end_byte - start_byte);
7973
7974 while (p < pend && ASCII_BYTE_P (*p)) p++;
7975 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7976
7977 while (p < pend)
7978 {
7979 if (ASCII_BYTE_P (*p))
7980 p++;
7981 else
7982 {
7983 c = STRING_CHAR_ADVANCE (p);
7984
7985 charset_map_loaded = 0;
7986 for (tail = coding_attrs_list; CONSP (tail);)
7987 {
7988 elt = XCAR (tail);
7989 if (NILP (elt))
7990 tail = XCDR (tail);
7991 else if (char_encodable_p (c, elt))
7992 tail = XCDR (tail);
7993 else if (CONSP (XCDR (tail)))
7994 {
7995 XSETCAR (tail, XCAR (XCDR (tail)));
7996 XSETCDR (tail, XCDR (XCDR (tail)));
7997 }
7998 else
7999 {
8000 XSETCAR (tail, Qnil);
8001 tail = XCDR (tail);
8002 }
8003 }
8004 if (charset_map_loaded)
8005 {
8006 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8007
8008 if (STRINGP (start))
8009 pbeg = SDATA (start);
8010 else
8011 pbeg = BYTE_POS_ADDR (start_byte);
8012 p = pbeg + p_offset;
8013 pend = pbeg + pend_offset;
8014 }
8015 }
8016 }
8017
8018 safe_codings = list2 (Qraw_text, Qno_conversion);
8019 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8020 if (! NILP (XCAR (tail)))
8021 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8022
8023 return safe_codings;
8024 }
8025
8026
8027 DEFUN ("unencodable-char-position", Funencodable_char_position,
8028 Sunencodable_char_position, 3, 5, 0,
8029 doc: /*
8030 Return position of first un-encodable character in a region.
8031 START and END specfiy the region and CODING-SYSTEM specifies the
8032 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8033
8034 If optional 4th argument COUNT is non-nil, it specifies at most how
8035 many un-encodable characters to search. In this case, the value is a
8036 list of positions.
8037
8038 If optional 5th argument STRING is non-nil, it is a string to search
8039 for un-encodable characters. In that case, START and END are indexes
8040 to the string. */)
8041 (start, end, coding_system, count, string)
8042 Lisp_Object start, end, coding_system, count, string;
8043 {
8044 int n;
8045 struct coding_system coding;
8046 Lisp_Object attrs, charset_list, translation_table;
8047 Lisp_Object positions;
8048 int from, to;
8049 const unsigned char *p, *stop, *pend;
8050 int ascii_compatible;
8051
8052 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8053 attrs = CODING_ID_ATTRS (coding.id);
8054 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8055 return Qnil;
8056 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8057 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8058 translation_table = get_translation_table (attrs, 1, NULL);
8059
8060 if (NILP (string))
8061 {
8062 validate_region (&start, &end);
8063 from = XINT (start);
8064 to = XINT (end);
8065 if (NILP (current_buffer->enable_multibyte_characters)
8066 || (ascii_compatible
8067 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8068 return Qnil;
8069 p = CHAR_POS_ADDR (from);
8070 pend = CHAR_POS_ADDR (to);
8071 if (from < GPT && to >= GPT)
8072 stop = GPT_ADDR;
8073 else
8074 stop = pend;
8075 }
8076 else
8077 {
8078 CHECK_STRING (string);
8079 CHECK_NATNUM (start);
8080 CHECK_NATNUM (end);
8081 from = XINT (start);
8082 to = XINT (end);
8083 if (from > to
8084 || to > SCHARS (string))
8085 args_out_of_range_3 (string, start, end);
8086 if (! STRING_MULTIBYTE (string))
8087 return Qnil;
8088 p = SDATA (string) + string_char_to_byte (string, from);
8089 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8090 if (ascii_compatible && (to - from) == (pend - p))
8091 return Qnil;
8092 }
8093
8094 if (NILP (count))
8095 n = 1;
8096 else
8097 {
8098 CHECK_NATNUM (count);
8099 n = XINT (count);
8100 }
8101
8102 positions = Qnil;
8103 while (1)
8104 {
8105 int c;
8106
8107 if (ascii_compatible)
8108 while (p < stop && ASCII_BYTE_P (*p))
8109 p++, from++;
8110 if (p >= stop)
8111 {
8112 if (p >= pend)
8113 break;
8114 stop = pend;
8115 p = GAP_END_ADDR;
8116 }
8117
8118 c = STRING_CHAR_ADVANCE (p);
8119 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8120 && ! char_charset (translate_char (translation_table, c),
8121 charset_list, NULL))
8122 {
8123 positions = Fcons (make_number (from), positions);
8124 n--;
8125 if (n == 0)
8126 break;
8127 }
8128
8129 from++;
8130 }
8131
8132 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8133 }
8134
8135
8136 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8137 Scheck_coding_systems_region, 3, 3, 0,
8138 doc: /* Check if the region is encodable by coding systems.
8139
8140 START and END are buffer positions specifying the region.
8141 CODING-SYSTEM-LIST is a list of coding systems to check.
8142
8143 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8144 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
8145 whole region, POS0, POS1, ... are buffer positions where non-encodable
8146 characters are found.
8147
8148 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8149 value is nil.
8150
8151 START may be a string. In that case, check if the string is
8152 encodable, and the value contains indices to the string instead of
8153 buffer positions. END is ignored. */)
8154 (start, end, coding_system_list)
8155 Lisp_Object start, end, coding_system_list;
8156 {
8157 Lisp_Object list;
8158 EMACS_INT start_byte, end_byte;
8159 int pos;
8160 const unsigned char *p, *pbeg, *pend;
8161 int c;
8162 Lisp_Object tail, elt, attrs;
8163
8164 if (STRINGP (start))
8165 {
8166 if (!STRING_MULTIBYTE (start)
8167 && SCHARS (start) != SBYTES (start))
8168 return Qnil;
8169 start_byte = 0;
8170 end_byte = SBYTES (start);
8171 pos = 0;
8172 }
8173 else
8174 {
8175 CHECK_NUMBER_COERCE_MARKER (start);
8176 CHECK_NUMBER_COERCE_MARKER (end);
8177 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8178 args_out_of_range (start, end);
8179 if (NILP (current_buffer->enable_multibyte_characters))
8180 return Qnil;
8181 start_byte = CHAR_TO_BYTE (XINT (start));
8182 end_byte = CHAR_TO_BYTE (XINT (end));
8183 if (XINT (end) - XINT (start) == end_byte - start_byte)
8184 return Qt;
8185
8186 if (XINT (start) < GPT && XINT (end) > GPT)
8187 {
8188 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8189 move_gap_both (XINT (start), start_byte);
8190 else
8191 move_gap_both (XINT (end), end_byte);
8192 }
8193 pos = XINT (start);
8194 }
8195
8196 list = Qnil;
8197 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8198 {
8199 elt = XCAR (tail);
8200 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8201 ASET (attrs, coding_attr_trans_tbl,
8202 get_translation_table (attrs, 1, NULL));
8203 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8204 }
8205
8206 if (STRINGP (start))
8207 p = pbeg = SDATA (start);
8208 else
8209 p = pbeg = BYTE_POS_ADDR (start_byte);
8210 pend = p + (end_byte - start_byte);
8211
8212 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8213 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8214
8215 while (p < pend)
8216 {
8217 if (ASCII_BYTE_P (*p))
8218 p++;
8219 else
8220 {
8221 c = STRING_CHAR_ADVANCE (p);
8222
8223 charset_map_loaded = 0;
8224 for (tail = list; CONSP (tail); tail = XCDR (tail))
8225 {
8226 elt = XCDR (XCAR (tail));
8227 if (! char_encodable_p (c, XCAR (elt)))
8228 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8229 }
8230 if (charset_map_loaded)
8231 {
8232 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8233
8234 if (STRINGP (start))
8235 pbeg = SDATA (start);
8236 else
8237 pbeg = BYTE_POS_ADDR (start_byte);
8238 p = pbeg + p_offset;
8239 pend = pbeg + pend_offset;
8240 }
8241 }
8242 pos++;
8243 }
8244
8245 tail = list;
8246 list = Qnil;
8247 for (; CONSP (tail); tail = XCDR (tail))
8248 {
8249 elt = XCAR (tail);
8250 if (CONSP (XCDR (XCDR (elt))))
8251 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8252 list);
8253 }
8254
8255 return list;
8256 }
8257
8258
8259 Lisp_Object
8260 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8261 Lisp_Object start, end, coding_system, dst_object;
8262 int encodep, norecord;
8263 {
8264 struct coding_system coding;
8265 EMACS_INT from, from_byte, to, to_byte;
8266 Lisp_Object src_object;
8267
8268 CHECK_NUMBER_COERCE_MARKER (start);
8269 CHECK_NUMBER_COERCE_MARKER (end);
8270 if (NILP (coding_system))
8271 coding_system = Qno_conversion;
8272 else
8273 CHECK_CODING_SYSTEM (coding_system);
8274 src_object = Fcurrent_buffer ();
8275 if (NILP (dst_object))
8276 dst_object = src_object;
8277 else if (! EQ (dst_object, Qt))
8278 CHECK_BUFFER (dst_object);
8279
8280 validate_region (&start, &end);
8281 from = XFASTINT (start);
8282 from_byte = CHAR_TO_BYTE (from);
8283 to = XFASTINT (end);
8284 to_byte = CHAR_TO_BYTE (to);
8285
8286 setup_coding_system (coding_system, &coding);
8287 coding.mode |= CODING_MODE_LAST_BLOCK;
8288
8289 if (encodep)
8290 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8291 dst_object);
8292 else
8293 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8294 dst_object);
8295 if (! norecord)
8296 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8297
8298 return (BUFFERP (dst_object)
8299 ? make_number (coding.produced_char)
8300 : coding.dst_object);
8301 }
8302
8303
8304 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8305 3, 4, "r\nzCoding system: ",
8306 doc: /* Decode the current region from the specified coding system.
8307 When called from a program, takes four arguments:
8308 START, END, CODING-SYSTEM, and DESTINATION.
8309 START and END are buffer positions.
8310
8311 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8312 If nil, the region between START and END is replaced by the decoded text.
8313 If buffer, the decoded text is inserted in the buffer.
8314 If t, the decoded text is returned.
8315
8316 This function sets `last-coding-system-used' to the precise coding system
8317 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8318 not fully specified.)
8319 It returns the length of the decoded text. */)
8320 (start, end, coding_system, destination)
8321 Lisp_Object start, end, coding_system, destination;
8322 {
8323 return code_convert_region (start, end, coding_system, destination, 0, 0);
8324 }
8325
8326 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8327 3, 4, "r\nzCoding system: ",
8328 doc: /* Encode the current region by specified coding system.
8329 When called from a program, takes three arguments:
8330 START, END, and CODING-SYSTEM. START and END are buffer positions.
8331
8332 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8333 If nil, the region between START and END is replace by the encoded text.
8334 If buffer, the encoded text is inserted in the buffer.
8335 If t, the encoded text is returned.
8336
8337 This function sets `last-coding-system-used' to the precise coding system
8338 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8339 not fully specified.)
8340 It returns the length of the encoded text. */)
8341 (start, end, coding_system, destination)
8342 Lisp_Object start, end, coding_system, destination;
8343 {
8344 return code_convert_region (start, end, coding_system, destination, 1, 0);
8345 }
8346
8347 Lisp_Object
8348 code_convert_string (string, coding_system, dst_object,
8349 encodep, nocopy, norecord)
8350 Lisp_Object string, coding_system, dst_object;
8351 int encodep, nocopy, norecord;
8352 {
8353 struct coding_system coding;
8354 EMACS_INT chars, bytes;
8355
8356 CHECK_STRING (string);
8357 if (NILP (coding_system))
8358 {
8359 if (! norecord)
8360 Vlast_coding_system_used = Qno_conversion;
8361 if (NILP (dst_object))
8362 return (nocopy ? Fcopy_sequence (string) : string);
8363 }
8364
8365 if (NILP (coding_system))
8366 coding_system = Qno_conversion;
8367 else
8368 CHECK_CODING_SYSTEM (coding_system);
8369 if (NILP (dst_object))
8370 dst_object = Qt;
8371 else if (! EQ (dst_object, Qt))
8372 CHECK_BUFFER (dst_object);
8373
8374 setup_coding_system (coding_system, &coding);
8375 coding.mode |= CODING_MODE_LAST_BLOCK;
8376 chars = SCHARS (string);
8377 bytes = SBYTES (string);
8378 if (encodep)
8379 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8380 else
8381 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8382 if (! norecord)
8383 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8384
8385 return (BUFFERP (dst_object)
8386 ? make_number (coding.produced_char)
8387 : coding.dst_object);
8388 }
8389
8390
8391 /* Encode or decode STRING according to CODING_SYSTEM.
8392 Do not set Vlast_coding_system_used.
8393
8394 This function is called only from macros DECODE_FILE and
8395 ENCODE_FILE, thus we ignore character composition. */
8396
8397 Lisp_Object
8398 code_convert_string_norecord (string, coding_system, encodep)
8399 Lisp_Object string, coding_system;
8400 int encodep;
8401 {
8402 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8403 }
8404
8405
8406 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8407 2, 4, 0,
8408 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8409
8410 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8411 if the decoding operation is trivial.
8412
8413 Optional fourth arg BUFFER non-nil meant that the decoded text is
8414 inserted in BUFFER instead of returned as a string. In this case,
8415 the return value is BUFFER.
8416
8417 This function sets `last-coding-system-used' to the precise coding system
8418 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8419 not fully specified. */)
8420 (string, coding_system, nocopy, buffer)
8421 Lisp_Object string, coding_system, nocopy, buffer;
8422 {
8423 return code_convert_string (string, coding_system, buffer,
8424 0, ! NILP (nocopy), 0);
8425 }
8426
8427 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8428 2, 4, 0,
8429 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8430
8431 Optional third arg NOCOPY non-nil means it is OK to return STRING
8432 itself if the encoding operation is trivial.
8433
8434 Optional fourth arg BUFFER non-nil meant that the encoded text is
8435 inserted in BUFFER instead of returned as a string. In this case,
8436 the return value is BUFFER.
8437
8438 This function sets `last-coding-system-used' to the precise coding system
8439 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8440 not fully specified.) */)
8441 (string, coding_system, nocopy, buffer)
8442 Lisp_Object string, coding_system, nocopy, buffer;
8443 {
8444 return code_convert_string (string, coding_system, buffer,
8445 1, ! NILP (nocopy), 1);
8446 }
8447
8448 \f
8449 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8450 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8451 Return the corresponding character. */)
8452 (code)
8453 Lisp_Object code;
8454 {
8455 Lisp_Object spec, attrs, val;
8456 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8457 int c;
8458
8459 CHECK_NATNUM (code);
8460 c = XFASTINT (code);
8461 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8462 attrs = AREF (spec, 0);
8463
8464 if (ASCII_BYTE_P (c)
8465 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8466 return code;
8467
8468 val = CODING_ATTR_CHARSET_LIST (attrs);
8469 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8470 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8471 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8472
8473 if (c <= 0x7F)
8474 charset = charset_roman;
8475 else if (c >= 0xA0 && c < 0xDF)
8476 {
8477 charset = charset_kana;
8478 c -= 0x80;
8479 }
8480 else
8481 {
8482 int s1 = c >> 8, s2 = c & 0xFF;
8483
8484 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8485 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8486 error ("Invalid code: %d", code);
8487 SJIS_TO_JIS (c);
8488 charset = charset_kanji;
8489 }
8490 c = DECODE_CHAR (charset, c);
8491 if (c < 0)
8492 error ("Invalid code: %d", code);
8493 return make_number (c);
8494 }
8495
8496
8497 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8498 doc: /* Encode a Japanese character CH to shift_jis encoding.
8499 Return the corresponding code in SJIS. */)
8500 (ch)
8501 Lisp_Object ch;
8502 {
8503 Lisp_Object spec, attrs, charset_list;
8504 int c;
8505 struct charset *charset;
8506 unsigned code;
8507
8508 CHECK_CHARACTER (ch);
8509 c = XFASTINT (ch);
8510 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8511 attrs = AREF (spec, 0);
8512
8513 if (ASCII_CHAR_P (c)
8514 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8515 return ch;
8516
8517 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8518 charset = char_charset (c, charset_list, &code);
8519 if (code == CHARSET_INVALID_CODE (charset))
8520 error ("Can't encode by shift_jis encoding: %d", c);
8521 JIS_TO_SJIS (code);
8522
8523 return make_number (code);
8524 }
8525
8526 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8527 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8528 Return the corresponding character. */)
8529 (code)
8530 Lisp_Object code;
8531 {
8532 Lisp_Object spec, attrs, val;
8533 struct charset *charset_roman, *charset_big5, *charset;
8534 int c;
8535
8536 CHECK_NATNUM (code);
8537 c = XFASTINT (code);
8538 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8539 attrs = AREF (spec, 0);
8540
8541 if (ASCII_BYTE_P (c)
8542 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8543 return code;
8544
8545 val = CODING_ATTR_CHARSET_LIST (attrs);
8546 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8547 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8548
8549 if (c <= 0x7F)
8550 charset = charset_roman;
8551 else
8552 {
8553 int b1 = c >> 8, b2 = c & 0x7F;
8554 if (b1 < 0xA1 || b1 > 0xFE
8555 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8556 error ("Invalid code: %d", code);
8557 charset = charset_big5;
8558 }
8559 c = DECODE_CHAR (charset, (unsigned )c);
8560 if (c < 0)
8561 error ("Invalid code: %d", code);
8562 return make_number (c);
8563 }
8564
8565 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8566 doc: /* Encode the Big5 character CH to BIG5 coding system.
8567 Return the corresponding character code in Big5. */)
8568 (ch)
8569 Lisp_Object ch;
8570 {
8571 Lisp_Object spec, attrs, charset_list;
8572 struct charset *charset;
8573 int c;
8574 unsigned code;
8575
8576 CHECK_CHARACTER (ch);
8577 c = XFASTINT (ch);
8578 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8579 attrs = AREF (spec, 0);
8580 if (ASCII_CHAR_P (c)
8581 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8582 return ch;
8583
8584 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8585 charset = char_charset (c, charset_list, &code);
8586 if (code == CHARSET_INVALID_CODE (charset))
8587 error ("Can't encode by Big5 encoding: %d", c);
8588
8589 return make_number (code);
8590 }
8591
8592 \f
8593 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
8594 Sset_terminal_coding_system_internal, 1, 2, 0,
8595 doc: /* Internal use only. */)
8596 (coding_system, terminal)
8597 Lisp_Object coding_system;
8598 Lisp_Object terminal;
8599 {
8600 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8601 CHECK_SYMBOL (coding_system);
8602 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
8603 /* We had better not send unsafe characters to terminal. */
8604 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
8605 /* Characer composition should be disabled. */
8606 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8607 terminal_coding->src_multibyte = 1;
8608 terminal_coding->dst_multibyte = 0;
8609 return Qnil;
8610 }
8611
8612 DEFUN ("set-safe-terminal-coding-system-internal",
8613 Fset_safe_terminal_coding_system_internal,
8614 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8615 doc: /* Internal use only. */)
8616 (coding_system)
8617 Lisp_Object coding_system;
8618 {
8619 CHECK_SYMBOL (coding_system);
8620 setup_coding_system (Fcheck_coding_system (coding_system),
8621 &safe_terminal_coding);
8622 /* Characer composition should be disabled. */
8623 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8624 safe_terminal_coding.src_multibyte = 1;
8625 safe_terminal_coding.dst_multibyte = 0;
8626 return Qnil;
8627 }
8628
8629 DEFUN ("terminal-coding-system", Fterminal_coding_system,
8630 Sterminal_coding_system, 0, 1, 0,
8631 doc: /* Return coding system specified for terminal output on the given terminal.
8632 TERMINAL may be a terminal id, a frame, or nil for the selected
8633 frame's terminal device. */)
8634 (terminal)
8635 Lisp_Object terminal;
8636 {
8637 struct coding_system *terminal_coding
8638 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8639 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
8640
8641 /* For backward compatibility, return nil if it is `undecided'. */
8642 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8643 }
8644
8645 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
8646 Sset_keyboard_coding_system_internal, 1, 2, 0,
8647 doc: /* Internal use only. */)
8648 (coding_system, terminal)
8649 Lisp_Object coding_system;
8650 Lisp_Object terminal;
8651 {
8652 struct terminal *t = get_terminal (terminal, 1);
8653 CHECK_SYMBOL (coding_system);
8654 setup_coding_system (Fcheck_coding_system (coding_system),
8655 TERMINAL_KEYBOARD_CODING (t));
8656 /* Characer composition should be disabled. */
8657 TERMINAL_KEYBOARD_CODING (t)->common_flags
8658 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8659 return Qnil;
8660 }
8661
8662 DEFUN ("keyboard-coding-system",
8663 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
8664 doc: /* Return coding system specified for decoding keyboard input. */)
8665 (terminal)
8666 Lisp_Object terminal;
8667 {
8668 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8669 (get_terminal (terminal, 1))->id);
8670 }
8671
8672 \f
8673 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8674 Sfind_operation_coding_system, 1, MANY, 0,
8675 doc: /* Choose a coding system for an operation based on the target name.
8676 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8677 DECODING-SYSTEM is the coding system to use for decoding
8678 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8679 for encoding (in case OPERATION does encoding).
8680
8681 The first argument OPERATION specifies an I/O primitive:
8682 For file I/O, `insert-file-contents' or `write-region'.
8683 For process I/O, `call-process', `call-process-region', or `start-process'.
8684 For network I/O, `open-network-stream'.
8685
8686 The remaining arguments should be the same arguments that were passed
8687 to the primitive. Depending on which primitive, one of those arguments
8688 is selected as the TARGET. For example, if OPERATION does file I/O,
8689 whichever argument specifies the file name is TARGET.
8690
8691 TARGET has a meaning which depends on OPERATION:
8692 For file I/O, TARGET is a file name (except for the special case below).
8693 For process I/O, TARGET is a process name.
8694 For network I/O, TARGET is a service name or a port number
8695
8696 This function looks up what specified for TARGET in,
8697 `file-coding-system-alist', `process-coding-system-alist',
8698 or `network-coding-system-alist' depending on OPERATION.
8699 They may specify a coding system, a cons of coding systems,
8700 or a function symbol to call.
8701 In the last case, we call the function with one argument,
8702 which is a list of all the arguments given to this function.
8703 If the function can't decide a coding system, it can return
8704 `undecided' so that the normal code-detection is performed.
8705
8706 If OPERATION is `insert-file-contents', the argument corresponding to
8707 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8708 file name to look up, and BUFFER is a buffer that contains the file's
8709 contents (not yet decoded). If `file-coding-system-alist' specifies a
8710 function to call for FILENAME, that function should examine the
8711 contents of BUFFER instead of reading the file.
8712
8713 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
8714 (nargs, args)
8715 int nargs;
8716 Lisp_Object *args;
8717 {
8718 Lisp_Object operation, target_idx, target, val;
8719 register Lisp_Object chain;
8720
8721 if (nargs < 2)
8722 error ("Too few arguments");
8723 operation = args[0];
8724 if (!SYMBOLP (operation)
8725 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8726 error ("Invalid first argument");
8727 if (nargs < 1 + XINT (target_idx))
8728 error ("Too few arguments for operation: %s",
8729 SDATA (SYMBOL_NAME (operation)));
8730 target = args[XINT (target_idx) + 1];
8731 if (!(STRINGP (target)
8732 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8733 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8734 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8735 error ("Invalid %dth argument", XINT (target_idx) + 1);
8736 if (CONSP (target))
8737 target = XCAR (target);
8738
8739 chain = ((EQ (operation, Qinsert_file_contents)
8740 || EQ (operation, Qwrite_region))
8741 ? Vfile_coding_system_alist
8742 : (EQ (operation, Qopen_network_stream)
8743 ? Vnetwork_coding_system_alist
8744 : Vprocess_coding_system_alist));
8745 if (NILP (chain))
8746 return Qnil;
8747
8748 for (; CONSP (chain); chain = XCDR (chain))
8749 {
8750 Lisp_Object elt;
8751
8752 elt = XCAR (chain);
8753 if (CONSP (elt)
8754 && ((STRINGP (target)
8755 && STRINGP (XCAR (elt))
8756 && fast_string_match (XCAR (elt), target) >= 0)
8757 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8758 {
8759 val = XCDR (elt);
8760 /* Here, if VAL is both a valid coding system and a valid
8761 function symbol, we return VAL as a coding system. */
8762 if (CONSP (val))
8763 return val;
8764 if (! SYMBOLP (val))
8765 return Qnil;
8766 if (! NILP (Fcoding_system_p (val)))
8767 return Fcons (val, val);
8768 if (! NILP (Ffboundp (val)))
8769 {
8770 /* We use call1 rather than safe_call1
8771 so as to get bug reports about functions called here
8772 which don't handle the current interface. */
8773 val = call1 (val, Flist (nargs, args));
8774 if (CONSP (val))
8775 return val;
8776 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8777 return Fcons (val, val);
8778 }
8779 return Qnil;
8780 }
8781 }
8782 return Qnil;
8783 }
8784
8785 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8786 Sset_coding_system_priority, 0, MANY, 0,
8787 doc: /* Assign higher priority to the coding systems given as arguments.
8788 If multiple coding systems belongs to the same category,
8789 all but the first one are ignored.
8790
8791 usage: (set-coding-system-priority ...) */)
8792 (nargs, args)
8793 int nargs;
8794 Lisp_Object *args;
8795 {
8796 int i, j;
8797 int changed[coding_category_max];
8798 enum coding_category priorities[coding_category_max];
8799
8800 bzero (changed, sizeof changed);
8801
8802 for (i = j = 0; i < nargs; i++)
8803 {
8804 enum coding_category category;
8805 Lisp_Object spec, attrs;
8806
8807 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8808 attrs = AREF (spec, 0);
8809 category = XINT (CODING_ATTR_CATEGORY (attrs));
8810 if (changed[category])
8811 /* Ignore this coding system because a coding system of the
8812 same category already had a higher priority. */
8813 continue;
8814 changed[category] = 1;
8815 priorities[j++] = category;
8816 if (coding_categories[category].id >= 0
8817 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8818 setup_coding_system (args[i], &coding_categories[category]);
8819 Fset (AREF (Vcoding_category_table, category), args[i]);
8820 }
8821
8822 /* Now we have decided top J priorities. Reflect the order of the
8823 original priorities to the remaining priorities. */
8824
8825 for (i = j, j = 0; i < coding_category_max; i++, j++)
8826 {
8827 while (j < coding_category_max
8828 && changed[coding_priorities[j]])
8829 j++;
8830 if (j == coding_category_max)
8831 abort ();
8832 priorities[i] = coding_priorities[j];
8833 }
8834
8835 bcopy (priorities, coding_priorities, sizeof priorities);
8836
8837 /* Update `coding-category-list'. */
8838 Vcoding_category_list = Qnil;
8839 for (i = coding_category_max - 1; i >= 0; i--)
8840 Vcoding_category_list
8841 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8842 Vcoding_category_list);
8843
8844 return Qnil;
8845 }
8846
8847 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8848 Scoding_system_priority_list, 0, 1, 0,
8849 doc: /* Return a list of coding systems ordered by their priorities.
8850 HIGHESTP non-nil means just return the highest priority one. */)
8851 (highestp)
8852 Lisp_Object highestp;
8853 {
8854 int i;
8855 Lisp_Object val;
8856
8857 for (i = 0, val = Qnil; i < coding_category_max; i++)
8858 {
8859 enum coding_category category = coding_priorities[i];
8860 int id = coding_categories[category].id;
8861 Lisp_Object attrs;
8862
8863 if (id < 0)
8864 continue;
8865 attrs = CODING_ID_ATTRS (id);
8866 if (! NILP (highestp))
8867 return CODING_ATTR_BASE_NAME (attrs);
8868 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8869 }
8870 return Fnreverse (val);
8871 }
8872
8873 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8874
8875 static Lisp_Object
8876 make_subsidiaries (base)
8877 Lisp_Object base;
8878 {
8879 Lisp_Object subsidiaries;
8880 int base_name_len = SBYTES (SYMBOL_NAME (base));
8881 char *buf = (char *) alloca (base_name_len + 6);
8882 int i;
8883
8884 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8885 subsidiaries = Fmake_vector (make_number (3), Qnil);
8886 for (i = 0; i < 3; i++)
8887 {
8888 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8889 ASET (subsidiaries, i, intern (buf));
8890 }
8891 return subsidiaries;
8892 }
8893
8894
8895 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8896 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8897 doc: /* For internal use only.
8898 usage: (define-coding-system-internal ...) */)
8899 (nargs, args)
8900 int nargs;
8901 Lisp_Object *args;
8902 {
8903 Lisp_Object name;
8904 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8905 Lisp_Object attrs; /* Vector of attributes. */
8906 Lisp_Object eol_type;
8907 Lisp_Object aliases;
8908 Lisp_Object coding_type, charset_list, safe_charsets;
8909 enum coding_category category;
8910 Lisp_Object tail, val;
8911 int max_charset_id = 0;
8912 int i;
8913
8914 if (nargs < coding_arg_max)
8915 goto short_args;
8916
8917 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8918
8919 name = args[coding_arg_name];
8920 CHECK_SYMBOL (name);
8921 CODING_ATTR_BASE_NAME (attrs) = name;
8922
8923 val = args[coding_arg_mnemonic];
8924 if (! STRINGP (val))
8925 CHECK_CHARACTER (val);
8926 CODING_ATTR_MNEMONIC (attrs) = val;
8927
8928 coding_type = args[coding_arg_coding_type];
8929 CHECK_SYMBOL (coding_type);
8930 CODING_ATTR_TYPE (attrs) = coding_type;
8931
8932 charset_list = args[coding_arg_charset_list];
8933 if (SYMBOLP (charset_list))
8934 {
8935 if (EQ (charset_list, Qiso_2022))
8936 {
8937 if (! EQ (coding_type, Qiso_2022))
8938 error ("Invalid charset-list");
8939 charset_list = Viso_2022_charset_list;
8940 }
8941 else if (EQ (charset_list, Qemacs_mule))
8942 {
8943 if (! EQ (coding_type, Qemacs_mule))
8944 error ("Invalid charset-list");
8945 charset_list = Vemacs_mule_charset_list;
8946 }
8947 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8948 if (max_charset_id < XFASTINT (XCAR (tail)))
8949 max_charset_id = XFASTINT (XCAR (tail));
8950 }
8951 else
8952 {
8953 charset_list = Fcopy_sequence (charset_list);
8954 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8955 {
8956 struct charset *charset;
8957
8958 val = XCAR (tail);
8959 CHECK_CHARSET_GET_CHARSET (val, charset);
8960 if (EQ (coding_type, Qiso_2022)
8961 ? CHARSET_ISO_FINAL (charset) < 0
8962 : EQ (coding_type, Qemacs_mule)
8963 ? CHARSET_EMACS_MULE_ID (charset) < 0
8964 : 0)
8965 error ("Can't handle charset `%s'",
8966 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8967
8968 XSETCAR (tail, make_number (charset->id));
8969 if (max_charset_id < charset->id)
8970 max_charset_id = charset->id;
8971 }
8972 }
8973 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8974
8975 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8976 make_number (255));
8977 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8978 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8979 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8980
8981 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8982
8983 val = args[coding_arg_decode_translation_table];
8984 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8985 CHECK_SYMBOL (val);
8986 CODING_ATTR_DECODE_TBL (attrs) = val;
8987
8988 val = args[coding_arg_encode_translation_table];
8989 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8990 CHECK_SYMBOL (val);
8991 CODING_ATTR_ENCODE_TBL (attrs) = val;
8992
8993 val = args[coding_arg_post_read_conversion];
8994 CHECK_SYMBOL (val);
8995 CODING_ATTR_POST_READ (attrs) = val;
8996
8997 val = args[coding_arg_pre_write_conversion];
8998 CHECK_SYMBOL (val);
8999 CODING_ATTR_PRE_WRITE (attrs) = val;
9000
9001 val = args[coding_arg_default_char];
9002 if (NILP (val))
9003 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9004 else
9005 {
9006 CHECK_CHARACTER (val);
9007 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9008 }
9009
9010 val = args[coding_arg_for_unibyte];
9011 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9012
9013 val = args[coding_arg_plist];
9014 CHECK_LIST (val);
9015 CODING_ATTR_PLIST (attrs) = val;
9016
9017 if (EQ (coding_type, Qcharset))
9018 {
9019 /* Generate a lisp vector of 256 elements. Each element is nil,
9020 integer, or a list of charset IDs.
9021
9022 If Nth element is nil, the byte code N is invalid in this
9023 coding system.
9024
9025 If Nth element is a number NUM, N is the first byte of a
9026 charset whose ID is NUM.
9027
9028 If Nth element is a list of charset IDs, N is the first byte
9029 of one of them. The list is sorted by dimensions of the
9030 charsets. A charset of smaller dimension comes firtst. */
9031 val = Fmake_vector (make_number (256), Qnil);
9032
9033 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9034 {
9035 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9036 int dim = CHARSET_DIMENSION (charset);
9037 int idx = (dim - 1) * 4;
9038
9039 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9040 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9041
9042 for (i = charset->code_space[idx];
9043 i <= charset->code_space[idx + 1]; i++)
9044 {
9045 Lisp_Object tmp, tmp2;
9046 int dim2;
9047
9048 tmp = AREF (val, i);
9049 if (NILP (tmp))
9050 tmp = XCAR (tail);
9051 else if (NUMBERP (tmp))
9052 {
9053 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9054 if (dim < dim2)
9055 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9056 else
9057 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9058 }
9059 else
9060 {
9061 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9062 {
9063 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9064 if (dim < dim2)
9065 break;
9066 }
9067 if (NILP (tmp2))
9068 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9069 else
9070 {
9071 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9072 XSETCAR (tmp2, XCAR (tail));
9073 }
9074 }
9075 ASET (val, i, tmp);
9076 }
9077 }
9078 ASET (attrs, coding_attr_charset_valids, val);
9079 category = coding_category_charset;
9080 }
9081 else if (EQ (coding_type, Qccl))
9082 {
9083 Lisp_Object valids;
9084
9085 if (nargs < coding_arg_ccl_max)
9086 goto short_args;
9087
9088 val = args[coding_arg_ccl_decoder];
9089 CHECK_CCL_PROGRAM (val);
9090 if (VECTORP (val))
9091 val = Fcopy_sequence (val);
9092 ASET (attrs, coding_attr_ccl_decoder, val);
9093
9094 val = args[coding_arg_ccl_encoder];
9095 CHECK_CCL_PROGRAM (val);
9096 if (VECTORP (val))
9097 val = Fcopy_sequence (val);
9098 ASET (attrs, coding_attr_ccl_encoder, val);
9099
9100 val = args[coding_arg_ccl_valids];
9101 valids = Fmake_string (make_number (256), make_number (0));
9102 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9103 {
9104 int from, to;
9105
9106 val = Fcar (tail);
9107 if (INTEGERP (val))
9108 {
9109 from = to = XINT (val);
9110 if (from < 0 || from > 255)
9111 args_out_of_range_3 (val, make_number (0), make_number (255));
9112 }
9113 else
9114 {
9115 CHECK_CONS (val);
9116 CHECK_NATNUM_CAR (val);
9117 CHECK_NATNUM_CDR (val);
9118 from = XINT (XCAR (val));
9119 if (from > 255)
9120 args_out_of_range_3 (XCAR (val),
9121 make_number (0), make_number (255));
9122 to = XINT (XCDR (val));
9123 if (to < from || to > 255)
9124 args_out_of_range_3 (XCDR (val),
9125 XCAR (val), make_number (255));
9126 }
9127 for (i = from; i <= to; i++)
9128 SSET (valids, i, 1);
9129 }
9130 ASET (attrs, coding_attr_ccl_valids, valids);
9131
9132 category = coding_category_ccl;
9133 }
9134 else if (EQ (coding_type, Qutf_16))
9135 {
9136 Lisp_Object bom, endian;
9137
9138 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9139
9140 if (nargs < coding_arg_utf16_max)
9141 goto short_args;
9142
9143 bom = args[coding_arg_utf16_bom];
9144 if (! NILP (bom) && ! EQ (bom, Qt))
9145 {
9146 CHECK_CONS (bom);
9147 val = XCAR (bom);
9148 CHECK_CODING_SYSTEM (val);
9149 val = XCDR (bom);
9150 CHECK_CODING_SYSTEM (val);
9151 }
9152 ASET (attrs, coding_attr_utf_16_bom, bom);
9153
9154 endian = args[coding_arg_utf16_endian];
9155 CHECK_SYMBOL (endian);
9156 if (NILP (endian))
9157 endian = Qbig;
9158 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9159 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9160 ASET (attrs, coding_attr_utf_16_endian, endian);
9161
9162 category = (CONSP (bom)
9163 ? coding_category_utf_16_auto
9164 : NILP (bom)
9165 ? (EQ (endian, Qbig)
9166 ? coding_category_utf_16_be_nosig
9167 : coding_category_utf_16_le_nosig)
9168 : (EQ (endian, Qbig)
9169 ? coding_category_utf_16_be
9170 : coding_category_utf_16_le));
9171 }
9172 else if (EQ (coding_type, Qiso_2022))
9173 {
9174 Lisp_Object initial, reg_usage, request, flags;
9175 int i;
9176
9177 if (nargs < coding_arg_iso2022_max)
9178 goto short_args;
9179
9180 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9181 CHECK_VECTOR (initial);
9182 for (i = 0; i < 4; i++)
9183 {
9184 val = Faref (initial, make_number (i));
9185 if (! NILP (val))
9186 {
9187 struct charset *charset;
9188
9189 CHECK_CHARSET_GET_CHARSET (val, charset);
9190 ASET (initial, i, make_number (CHARSET_ID (charset)));
9191 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9192 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9193 }
9194 else
9195 ASET (initial, i, make_number (-1));
9196 }
9197
9198 reg_usage = args[coding_arg_iso2022_reg_usage];
9199 CHECK_CONS (reg_usage);
9200 CHECK_NUMBER_CAR (reg_usage);
9201 CHECK_NUMBER_CDR (reg_usage);
9202
9203 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9204 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9205 {
9206 int id;
9207 Lisp_Object tmp;
9208
9209 val = Fcar (tail);
9210 CHECK_CONS (val);
9211 tmp = XCAR (val);
9212 CHECK_CHARSET_GET_ID (tmp, id);
9213 CHECK_NATNUM_CDR (val);
9214 if (XINT (XCDR (val)) >= 4)
9215 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9216 XSETCAR (val, make_number (id));
9217 }
9218
9219 flags = args[coding_arg_iso2022_flags];
9220 CHECK_NATNUM (flags);
9221 i = XINT (flags);
9222 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9223 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9224
9225 ASET (attrs, coding_attr_iso_initial, initial);
9226 ASET (attrs, coding_attr_iso_usage, reg_usage);
9227 ASET (attrs, coding_attr_iso_request, request);
9228 ASET (attrs, coding_attr_iso_flags, flags);
9229 setup_iso_safe_charsets (attrs);
9230
9231 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9232 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9233 | CODING_ISO_FLAG_SINGLE_SHIFT))
9234 ? coding_category_iso_7_else
9235 : EQ (args[coding_arg_charset_list], Qiso_2022)
9236 ? coding_category_iso_7
9237 : coding_category_iso_7_tight);
9238 else
9239 {
9240 int id = XINT (AREF (initial, 1));
9241
9242 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9243 || EQ (args[coding_arg_charset_list], Qiso_2022)
9244 || id < 0)
9245 ? coding_category_iso_8_else
9246 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9247 ? coding_category_iso_8_1
9248 : coding_category_iso_8_2);
9249 }
9250 if (category != coding_category_iso_8_1
9251 && category != coding_category_iso_8_2)
9252 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9253 }
9254 else if (EQ (coding_type, Qemacs_mule))
9255 {
9256 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9257 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9258 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9259 category = coding_category_emacs_mule;
9260 }
9261 else if (EQ (coding_type, Qshift_jis))
9262 {
9263
9264 struct charset *charset;
9265
9266 if (XINT (Flength (charset_list)) != 3
9267 && XINT (Flength (charset_list)) != 4)
9268 error ("There should be three or four charsets");
9269
9270 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9271 if (CHARSET_DIMENSION (charset) != 1)
9272 error ("Dimension of charset %s is not one",
9273 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9274 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9275 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9276
9277 charset_list = XCDR (charset_list);
9278 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9279 if (CHARSET_DIMENSION (charset) != 1)
9280 error ("Dimension of charset %s is not one",
9281 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9282
9283 charset_list = XCDR (charset_list);
9284 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9285 if (CHARSET_DIMENSION (charset) != 2)
9286 error ("Dimension of charset %s is not two",
9287 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9288
9289 charset_list = XCDR (charset_list);
9290 if (! NILP (charset_list))
9291 {
9292 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9293 if (CHARSET_DIMENSION (charset) != 2)
9294 error ("Dimension of charset %s is not two",
9295 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9296 }
9297
9298 category = coding_category_sjis;
9299 Vsjis_coding_system = name;
9300 }
9301 else if (EQ (coding_type, Qbig5))
9302 {
9303 struct charset *charset;
9304
9305 if (XINT (Flength (charset_list)) != 2)
9306 error ("There should be just two charsets");
9307
9308 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9309 if (CHARSET_DIMENSION (charset) != 1)
9310 error ("Dimension of charset %s is not one",
9311 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9312 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9313 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9314
9315 charset_list = XCDR (charset_list);
9316 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9317 if (CHARSET_DIMENSION (charset) != 2)
9318 error ("Dimension of charset %s is not two",
9319 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9320
9321 category = coding_category_big5;
9322 Vbig5_coding_system = name;
9323 }
9324 else if (EQ (coding_type, Qraw_text))
9325 {
9326 category = coding_category_raw_text;
9327 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9328 }
9329 else if (EQ (coding_type, Qutf_8))
9330 {
9331 category = coding_category_utf_8;
9332 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9333 }
9334 else if (EQ (coding_type, Qundecided))
9335 category = coding_category_undecided;
9336 else
9337 error ("Invalid coding system type: %s",
9338 SDATA (SYMBOL_NAME (coding_type)));
9339
9340 CODING_ATTR_CATEGORY (attrs) = make_number (category);
9341 CODING_ATTR_PLIST (attrs)
9342 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9343 CODING_ATTR_PLIST (attrs)));
9344 CODING_ATTR_PLIST (attrs)
9345 = Fcons (QCascii_compatible_p,
9346 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9347 CODING_ATTR_PLIST (attrs)));
9348
9349 eol_type = args[coding_arg_eol_type];
9350 if (! NILP (eol_type)
9351 && ! EQ (eol_type, Qunix)
9352 && ! EQ (eol_type, Qdos)
9353 && ! EQ (eol_type, Qmac))
9354 error ("Invalid eol-type");
9355
9356 aliases = Fcons (name, Qnil);
9357
9358 if (NILP (eol_type))
9359 {
9360 eol_type = make_subsidiaries (name);
9361 for (i = 0; i < 3; i++)
9362 {
9363 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9364
9365 this_name = AREF (eol_type, i);
9366 this_aliases = Fcons (this_name, Qnil);
9367 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9368 this_spec = Fmake_vector (make_number (3), attrs);
9369 ASET (this_spec, 1, this_aliases);
9370 ASET (this_spec, 2, this_eol_type);
9371 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9372 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9373 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9374 if (NILP (val))
9375 Vcoding_system_alist
9376 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9377 Vcoding_system_alist);
9378 }
9379 }
9380
9381 spec_vec = Fmake_vector (make_number (3), attrs);
9382 ASET (spec_vec, 1, aliases);
9383 ASET (spec_vec, 2, eol_type);
9384
9385 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9386 Vcoding_system_list = Fcons (name, Vcoding_system_list);
9387 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9388 if (NILP (val))
9389 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9390 Vcoding_system_alist);
9391
9392 {
9393 int id = coding_categories[category].id;
9394
9395 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9396 setup_coding_system (name, &coding_categories[category]);
9397 }
9398
9399 return Qnil;
9400
9401 short_args:
9402 return Fsignal (Qwrong_number_of_arguments,
9403 Fcons (intern ("define-coding-system-internal"),
9404 make_number (nargs)));
9405 }
9406
9407
9408 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9409 3, 3, 0,
9410 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9411 (coding_system, prop, val)
9412 Lisp_Object coding_system, prop, val;
9413 {
9414 Lisp_Object spec, attrs;
9415
9416 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9417 attrs = AREF (spec, 0);
9418 if (EQ (prop, QCmnemonic))
9419 {
9420 if (! STRINGP (val))
9421 CHECK_CHARACTER (val);
9422 CODING_ATTR_MNEMONIC (attrs) = val;
9423 }
9424 else if (EQ (prop, QCdefalut_char))
9425 {
9426 if (NILP (val))
9427 val = make_number (' ');
9428 else
9429 CHECK_CHARACTER (val);
9430 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9431 }
9432 else if (EQ (prop, QCdecode_translation_table))
9433 {
9434 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9435 CHECK_SYMBOL (val);
9436 CODING_ATTR_DECODE_TBL (attrs) = val;
9437 }
9438 else if (EQ (prop, QCencode_translation_table))
9439 {
9440 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9441 CHECK_SYMBOL (val);
9442 CODING_ATTR_ENCODE_TBL (attrs) = val;
9443 }
9444 else if (EQ (prop, QCpost_read_conversion))
9445 {
9446 CHECK_SYMBOL (val);
9447 CODING_ATTR_POST_READ (attrs) = val;
9448 }
9449 else if (EQ (prop, QCpre_write_conversion))
9450 {
9451 CHECK_SYMBOL (val);
9452 CODING_ATTR_PRE_WRITE (attrs) = val;
9453 }
9454 else if (EQ (prop, QCascii_compatible_p))
9455 {
9456 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9457 }
9458
9459 CODING_ATTR_PLIST (attrs)
9460 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9461 return val;
9462 }
9463
9464
9465 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9466 Sdefine_coding_system_alias, 2, 2, 0,
9467 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9468 (alias, coding_system)
9469 Lisp_Object alias, coding_system;
9470 {
9471 Lisp_Object spec, aliases, eol_type, val;
9472
9473 CHECK_SYMBOL (alias);
9474 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9475 aliases = AREF (spec, 1);
9476 /* ALISES should be a list of length more than zero, and the first
9477 element is a base coding system. Append ALIAS at the tail of the
9478 list. */
9479 while (!NILP (XCDR (aliases)))
9480 aliases = XCDR (aliases);
9481 XSETCDR (aliases, Fcons (alias, Qnil));
9482
9483 eol_type = AREF (spec, 2);
9484 if (VECTORP (eol_type))
9485 {
9486 Lisp_Object subsidiaries;
9487 int i;
9488
9489 subsidiaries = make_subsidiaries (alias);
9490 for (i = 0; i < 3; i++)
9491 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9492 AREF (eol_type, i));
9493 }
9494
9495 Fputhash (alias, spec, Vcoding_system_hash_table);
9496 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9497 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9498 if (NILP (val))
9499 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9500 Vcoding_system_alist);
9501
9502 return Qnil;
9503 }
9504
9505 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9506 1, 1, 0,
9507 doc: /* Return the base of CODING-SYSTEM.
9508 Any alias or subsidiary coding system is not a base coding system. */)
9509 (coding_system)
9510 Lisp_Object coding_system;
9511 {
9512 Lisp_Object spec, attrs;
9513
9514 if (NILP (coding_system))
9515 return (Qno_conversion);
9516 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9517 attrs = AREF (spec, 0);
9518 return CODING_ATTR_BASE_NAME (attrs);
9519 }
9520
9521 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9522 1, 1, 0,
9523 doc: "Return the property list of CODING-SYSTEM.")
9524 (coding_system)
9525 Lisp_Object coding_system;
9526 {
9527 Lisp_Object spec, attrs;
9528
9529 if (NILP (coding_system))
9530 coding_system = Qno_conversion;
9531 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9532 attrs = AREF (spec, 0);
9533 return CODING_ATTR_PLIST (attrs);
9534 }
9535
9536
9537 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9538 1, 1, 0,
9539 doc: /* Return the list of aliases of CODING-SYSTEM. */)
9540 (coding_system)
9541 Lisp_Object coding_system;
9542 {
9543 Lisp_Object spec;
9544
9545 if (NILP (coding_system))
9546 coding_system = Qno_conversion;
9547 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9548 return AREF (spec, 1);
9549 }
9550
9551 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9552 Scoding_system_eol_type, 1, 1, 0,
9553 doc: /* Return eol-type of CODING-SYSTEM.
9554 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9555
9556 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9557 and CR respectively.
9558
9559 A vector value indicates that a format of end-of-line should be
9560 detected automatically. Nth element of the vector is the subsidiary
9561 coding system whose eol-type is N. */)
9562 (coding_system)
9563 Lisp_Object coding_system;
9564 {
9565 Lisp_Object spec, eol_type;
9566 int n;
9567
9568 if (NILP (coding_system))
9569 coding_system = Qno_conversion;
9570 if (! CODING_SYSTEM_P (coding_system))
9571 return Qnil;
9572 spec = CODING_SYSTEM_SPEC (coding_system);
9573 eol_type = AREF (spec, 2);
9574 if (VECTORP (eol_type))
9575 return Fcopy_sequence (eol_type);
9576 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9577 return make_number (n);
9578 }
9579
9580 #endif /* emacs */
9581
9582 \f
9583 /*** 9. Post-amble ***/
9584
9585 void
9586 init_coding_once ()
9587 {
9588 int i;
9589
9590 for (i = 0; i < coding_category_max; i++)
9591 {
9592 coding_categories[i].id = -1;
9593 coding_priorities[i] = i;
9594 }
9595
9596 /* ISO2022 specific initialize routine. */
9597 for (i = 0; i < 0x20; i++)
9598 iso_code_class[i] = ISO_control_0;
9599 for (i = 0x21; i < 0x7F; i++)
9600 iso_code_class[i] = ISO_graphic_plane_0;
9601 for (i = 0x80; i < 0xA0; i++)
9602 iso_code_class[i] = ISO_control_1;
9603 for (i = 0xA1; i < 0xFF; i++)
9604 iso_code_class[i] = ISO_graphic_plane_1;
9605 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9606 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9607 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9608 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9609 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9610 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9611 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9612 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9613 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9614
9615 for (i = 0; i < 256; i++)
9616 {
9617 emacs_mule_bytes[i] = 1;
9618 }
9619 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9620 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9621 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9622 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9623 }
9624
9625 #ifdef emacs
9626
9627 void
9628 syms_of_coding ()
9629 {
9630 staticpro (&Vcoding_system_hash_table);
9631 {
9632 Lisp_Object args[2];
9633 args[0] = QCtest;
9634 args[1] = Qeq;
9635 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9636 }
9637
9638 staticpro (&Vsjis_coding_system);
9639 Vsjis_coding_system = Qnil;
9640
9641 staticpro (&Vbig5_coding_system);
9642 Vbig5_coding_system = Qnil;
9643
9644 staticpro (&Vcode_conversion_reused_workbuf);
9645 Vcode_conversion_reused_workbuf = Qnil;
9646
9647 staticpro (&Vcode_conversion_workbuf_name);
9648 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9649
9650 reused_workbuf_in_use = 0;
9651
9652 DEFSYM (Qcharset, "charset");
9653 DEFSYM (Qtarget_idx, "target-idx");
9654 DEFSYM (Qcoding_system_history, "coding-system-history");
9655 Fset (Qcoding_system_history, Qnil);
9656
9657 /* Target FILENAME is the first argument. */
9658 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9659 /* Target FILENAME is the third argument. */
9660 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9661
9662 DEFSYM (Qcall_process, "call-process");
9663 /* Target PROGRAM is the first argument. */
9664 Fput (Qcall_process, Qtarget_idx, make_number (0));
9665
9666 DEFSYM (Qcall_process_region, "call-process-region");
9667 /* Target PROGRAM is the third argument. */
9668 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9669
9670 DEFSYM (Qstart_process, "start-process");
9671 /* Target PROGRAM is the third argument. */
9672 Fput (Qstart_process, Qtarget_idx, make_number (2));
9673
9674 DEFSYM (Qopen_network_stream, "open-network-stream");
9675 /* Target SERVICE is the fourth argument. */
9676 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9677
9678 DEFSYM (Qcoding_system, "coding-system");
9679 DEFSYM (Qcoding_aliases, "coding-aliases");
9680
9681 DEFSYM (Qeol_type, "eol-type");
9682 DEFSYM (Qunix, "unix");
9683 DEFSYM (Qdos, "dos");
9684
9685 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9686 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9687 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9688 DEFSYM (Qdefault_char, "default-char");
9689 DEFSYM (Qundecided, "undecided");
9690 DEFSYM (Qno_conversion, "no-conversion");
9691 DEFSYM (Qraw_text, "raw-text");
9692
9693 DEFSYM (Qiso_2022, "iso-2022");
9694
9695 DEFSYM (Qutf_8, "utf-8");
9696 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9697
9698 DEFSYM (Qutf_16, "utf-16");
9699 DEFSYM (Qbig, "big");
9700 DEFSYM (Qlittle, "little");
9701
9702 DEFSYM (Qshift_jis, "shift-jis");
9703 DEFSYM (Qbig5, "big5");
9704
9705 DEFSYM (Qcoding_system_p, "coding-system-p");
9706
9707 DEFSYM (Qcoding_system_error, "coding-system-error");
9708 Fput (Qcoding_system_error, Qerror_conditions,
9709 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9710 Fput (Qcoding_system_error, Qerror_message,
9711 build_string ("Invalid coding system"));
9712
9713 /* Intern this now in case it isn't already done.
9714 Setting this variable twice is harmless.
9715 But don't staticpro it here--that is done in alloc.c. */
9716 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9717
9718 DEFSYM (Qtranslation_table, "translation-table");
9719 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9720 DEFSYM (Qtranslation_table_id, "translation-table-id");
9721 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9722 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9723
9724 DEFSYM (Qvalid_codes, "valid-codes");
9725
9726 DEFSYM (Qemacs_mule, "emacs-mule");
9727
9728 DEFSYM (QCcategory, ":category");
9729 DEFSYM (QCmnemonic, ":mnemonic");
9730 DEFSYM (QCdefalut_char, ":default-char");
9731 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9732 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9733 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9734 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9735 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9736
9737 Vcoding_category_table
9738 = Fmake_vector (make_number (coding_category_max), Qnil);
9739 staticpro (&Vcoding_category_table);
9740 /* Followings are target of code detection. */
9741 ASET (Vcoding_category_table, coding_category_iso_7,
9742 intern ("coding-category-iso-7"));
9743 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9744 intern ("coding-category-iso-7-tight"));
9745 ASET (Vcoding_category_table, coding_category_iso_8_1,
9746 intern ("coding-category-iso-8-1"));
9747 ASET (Vcoding_category_table, coding_category_iso_8_2,
9748 intern ("coding-category-iso-8-2"));
9749 ASET (Vcoding_category_table, coding_category_iso_7_else,
9750 intern ("coding-category-iso-7-else"));
9751 ASET (Vcoding_category_table, coding_category_iso_8_else,
9752 intern ("coding-category-iso-8-else"));
9753 ASET (Vcoding_category_table, coding_category_utf_8,
9754 intern ("coding-category-utf-8"));
9755 ASET (Vcoding_category_table, coding_category_utf_16_be,
9756 intern ("coding-category-utf-16-be"));
9757 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9758 intern ("coding-category-utf-16-auto"));
9759 ASET (Vcoding_category_table, coding_category_utf_16_le,
9760 intern ("coding-category-utf-16-le"));
9761 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9762 intern ("coding-category-utf-16-be-nosig"));
9763 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9764 intern ("coding-category-utf-16-le-nosig"));
9765 ASET (Vcoding_category_table, coding_category_charset,
9766 intern ("coding-category-charset"));
9767 ASET (Vcoding_category_table, coding_category_sjis,
9768 intern ("coding-category-sjis"));
9769 ASET (Vcoding_category_table, coding_category_big5,
9770 intern ("coding-category-big5"));
9771 ASET (Vcoding_category_table, coding_category_ccl,
9772 intern ("coding-category-ccl"));
9773 ASET (Vcoding_category_table, coding_category_emacs_mule,
9774 intern ("coding-category-emacs-mule"));
9775 /* Followings are NOT target of code detection. */
9776 ASET (Vcoding_category_table, coding_category_raw_text,
9777 intern ("coding-category-raw-text"));
9778 ASET (Vcoding_category_table, coding_category_undecided,
9779 intern ("coding-category-undecided"));
9780
9781 DEFSYM (Qinsufficient_source, "insufficient-source");
9782 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9783 DEFSYM (Qinvalid_source, "invalid-source");
9784 DEFSYM (Qinterrupted, "interrupted");
9785 DEFSYM (Qinsufficient_memory, "insufficient-memory");
9786 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9787
9788 defsubr (&Scoding_system_p);
9789 defsubr (&Sread_coding_system);
9790 defsubr (&Sread_non_nil_coding_system);
9791 defsubr (&Scheck_coding_system);
9792 defsubr (&Sdetect_coding_region);
9793 defsubr (&Sdetect_coding_string);
9794 defsubr (&Sfind_coding_systems_region_internal);
9795 defsubr (&Sunencodable_char_position);
9796 defsubr (&Scheck_coding_systems_region);
9797 defsubr (&Sdecode_coding_region);
9798 defsubr (&Sencode_coding_region);
9799 defsubr (&Sdecode_coding_string);
9800 defsubr (&Sencode_coding_string);
9801 defsubr (&Sdecode_sjis_char);
9802 defsubr (&Sencode_sjis_char);
9803 defsubr (&Sdecode_big5_char);
9804 defsubr (&Sencode_big5_char);
9805 defsubr (&Sset_terminal_coding_system_internal);
9806 defsubr (&Sset_safe_terminal_coding_system_internal);
9807 defsubr (&Sterminal_coding_system);
9808 defsubr (&Sset_keyboard_coding_system_internal);
9809 defsubr (&Skeyboard_coding_system);
9810 defsubr (&Sfind_operation_coding_system);
9811 defsubr (&Sset_coding_system_priority);
9812 defsubr (&Sdefine_coding_system_internal);
9813 defsubr (&Sdefine_coding_system_alias);
9814 defsubr (&Scoding_system_put);
9815 defsubr (&Scoding_system_base);
9816 defsubr (&Scoding_system_plist);
9817 defsubr (&Scoding_system_aliases);
9818 defsubr (&Scoding_system_eol_type);
9819 defsubr (&Scoding_system_priority_list);
9820
9821 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9822 doc: /* List of coding systems.
9823
9824 Do not alter the value of this variable manually. This variable should be
9825 updated by the functions `define-coding-system' and
9826 `define-coding-system-alias'. */);
9827 Vcoding_system_list = Qnil;
9828
9829 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9830 doc: /* Alist of coding system names.
9831 Each element is one element list of coding system name.
9832 This variable is given to `completing-read' as TABLE argument.
9833
9834 Do not alter the value of this variable manually. This variable should be
9835 updated by the functions `make-coding-system' and
9836 `define-coding-system-alias'. */);
9837 Vcoding_system_alist = Qnil;
9838
9839 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9840 doc: /* List of coding-categories (symbols) ordered by priority.
9841
9842 On detecting a coding system, Emacs tries code detection algorithms
9843 associated with each coding-category one by one in this order. When
9844 one algorithm agrees with a byte sequence of source text, the coding
9845 system bound to the corresponding coding-category is selected.
9846
9847 Don't modify this variable directly, but use `set-coding-priority'. */);
9848 {
9849 int i;
9850
9851 Vcoding_category_list = Qnil;
9852 for (i = coding_category_max - 1; i >= 0; i--)
9853 Vcoding_category_list
9854 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9855 Vcoding_category_list);
9856 }
9857
9858 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9859 doc: /* Specify the coding system for read operations.
9860 It is useful to bind this variable with `let', but do not set it globally.
9861 If the value is a coding system, it is used for decoding on read operation.
9862 If not, an appropriate element is used from one of the coding system alists:
9863 There are three such tables, `file-coding-system-alist',
9864 `process-coding-system-alist', and `network-coding-system-alist'. */);
9865 Vcoding_system_for_read = Qnil;
9866
9867 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9868 doc: /* Specify the coding system for write operations.
9869 Programs bind this variable with `let', but you should not set it globally.
9870 If the value is a coding system, it is used for encoding of output,
9871 when writing it to a file and when sending it to a file or subprocess.
9872
9873 If this does not specify a coding system, an appropriate element
9874 is used from one of the coding system alists:
9875 There are three such tables, `file-coding-system-alist',
9876 `process-coding-system-alist', and `network-coding-system-alist'.
9877 For output to files, if the above procedure does not specify a coding system,
9878 the value of `buffer-file-coding-system' is used. */);
9879 Vcoding_system_for_write = Qnil;
9880
9881 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9882 doc: /*
9883 Coding system used in the latest file or process I/O. */);
9884 Vlast_coding_system_used = Qnil;
9885
9886 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9887 doc: /*
9888 Error status of the last code conversion.
9889
9890 When an error was detected in the last code conversion, this variable
9891 is set to one of the following symbols.
9892 `insufficient-source'
9893 `inconsistent-eol'
9894 `invalid-source'
9895 `interrupted'
9896 `insufficient-memory'
9897 When no error was detected, the value doesn't change. So, to check
9898 the error status of a code conversion by this variable, you must
9899 explicitly set this variable to nil before performing code
9900 conversion. */);
9901 Vlast_code_conversion_error = Qnil;
9902
9903 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9904 doc: /*
9905 *Non-nil means always inhibit code conversion of end-of-line format.
9906 See info node `Coding Systems' and info node `Text and Binary' concerning
9907 such conversion. */);
9908 inhibit_eol_conversion = 0;
9909
9910 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9911 doc: /*
9912 Non-nil means process buffer inherits coding system of process output.
9913 Bind it to t if the process output is to be treated as if it were a file
9914 read from some filesystem. */);
9915 inherit_process_coding_system = 0;
9916
9917 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9918 doc: /*
9919 Alist to decide a coding system to use for a file I/O operation.
9920 The format is ((PATTERN . VAL) ...),
9921 where PATTERN is a regular expression matching a file name,
9922 VAL is a coding system, a cons of coding systems, or a function symbol.
9923 If VAL is a coding system, it is used for both decoding and encoding
9924 the file contents.
9925 If VAL is a cons of coding systems, the car part is used for decoding,
9926 and the cdr part is used for encoding.
9927 If VAL is a function symbol, the function must return a coding system
9928 or a cons of coding systems which are used as above. The function is
9929 called with an argument that is a list of the arguments with which
9930 `find-operation-coding-system' was called. If the function can't decide
9931 a coding system, it can return `undecided' so that the normal
9932 code-detection is performed.
9933
9934 See also the function `find-operation-coding-system'
9935 and the variable `auto-coding-alist'. */);
9936 Vfile_coding_system_alist = Qnil;
9937
9938 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9939 doc: /*
9940 Alist to decide a coding system to use for a process I/O operation.
9941 The format is ((PATTERN . VAL) ...),
9942 where PATTERN is a regular expression matching a program name,
9943 VAL is a coding system, a cons of coding systems, or a function symbol.
9944 If VAL is a coding system, it is used for both decoding what received
9945 from the program and encoding what sent to the program.
9946 If VAL is a cons of coding systems, the car part is used for decoding,
9947 and the cdr part is used for encoding.
9948 If VAL is a function symbol, the function must return a coding system
9949 or a cons of coding systems which are used as above.
9950
9951 See also the function `find-operation-coding-system'. */);
9952 Vprocess_coding_system_alist = Qnil;
9953
9954 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9955 doc: /*
9956 Alist to decide a coding system to use for a network I/O operation.
9957 The format is ((PATTERN . VAL) ...),
9958 where PATTERN is a regular expression matching a network service name
9959 or is a port number to connect to,
9960 VAL is a coding system, a cons of coding systems, or a function symbol.
9961 If VAL is a coding system, it is used for both decoding what received
9962 from the network stream and encoding what sent to the network stream.
9963 If VAL is a cons of coding systems, the car part is used for decoding,
9964 and the cdr part is used for encoding.
9965 If VAL is a function symbol, the function must return a coding system
9966 or a cons of coding systems which are used as above.
9967
9968 See also the function `find-operation-coding-system'. */);
9969 Vnetwork_coding_system_alist = Qnil;
9970
9971 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9972 doc: /* Coding system to use with system messages.
9973 Also used for decoding keyboard input on X Window system. */);
9974 Vlocale_coding_system = Qnil;
9975
9976 /* The eol mnemonics are reset in startup.el system-dependently. */
9977 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9978 doc: /*
9979 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9980 eol_mnemonic_unix = build_string (":");
9981
9982 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9983 doc: /*
9984 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9985 eol_mnemonic_dos = build_string ("\\");
9986
9987 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9988 doc: /*
9989 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9990 eol_mnemonic_mac = build_string ("/");
9991
9992 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9993 doc: /*
9994 *String displayed in mode line when end-of-line format is not yet determined. */);
9995 eol_mnemonic_undecided = build_string (":");
9996
9997 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9998 doc: /*
9999 *Non-nil enables character translation while encoding and decoding. */);
10000 Venable_character_translation = Qt;
10001
10002 DEFVAR_LISP ("standard-translation-table-for-decode",
10003 &Vstandard_translation_table_for_decode,
10004 doc: /* Table for translating characters while decoding. */);
10005 Vstandard_translation_table_for_decode = Qnil;
10006
10007 DEFVAR_LISP ("standard-translation-table-for-encode",
10008 &Vstandard_translation_table_for_encode,
10009 doc: /* Table for translating characters while encoding. */);
10010 Vstandard_translation_table_for_encode = Qnil;
10011
10012 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10013 doc: /* Alist of charsets vs revision numbers.
10014 While encoding, if a charset (car part of an element) is found,
10015 designate it with the escape sequence identifying revision (cdr part
10016 of the element). */);
10017 Vcharset_revision_table = Qnil;
10018
10019 DEFVAR_LISP ("default-process-coding-system",
10020 &Vdefault_process_coding_system,
10021 doc: /* Cons of coding systems used for process I/O by default.
10022 The car part is used for decoding a process output,
10023 the cdr part is used for encoding a text to be sent to a process. */);
10024 Vdefault_process_coding_system = Qnil;
10025
10026 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10027 doc: /*
10028 Table of extra Latin codes in the range 128..159 (inclusive).
10029 This is a vector of length 256.
10030 If Nth element is non-nil, the existence of code N in a file
10031 \(or output of subprocess) doesn't prevent it to be detected as
10032 a coding system of ISO 2022 variant which has a flag
10033 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10034 or reading output of a subprocess.
10035 Only 128th through 159th elements has a meaning. */);
10036 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10037
10038 DEFVAR_LISP ("select-safe-coding-system-function",
10039 &Vselect_safe_coding_system_function,
10040 doc: /*
10041 Function to call to select safe coding system for encoding a text.
10042
10043 If set, this function is called to force a user to select a proper
10044 coding system which can encode the text in the case that a default
10045 coding system used in each operation can't encode the text. The
10046 function should take care that the buffer is not modified while
10047 the coding system is being selected.
10048
10049 The default value is `select-safe-coding-system' (which see). */);
10050 Vselect_safe_coding_system_function = Qnil;
10051
10052 DEFVAR_BOOL ("coding-system-require-warning",
10053 &coding_system_require_warning,
10054 doc: /* Internal use only.
10055 If non-nil, on writing a file, `select-safe-coding-system-function' is
10056 called even if `coding-system-for-write' is non-nil. The command
10057 `universal-coding-system-argument' binds this variable to t temporarily. */);
10058 coding_system_require_warning = 0;
10059
10060
10061 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10062 &inhibit_iso_escape_detection,
10063 doc: /*
10064 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
10065
10066 By default, on reading a file, Emacs tries to detect how the text is
10067 encoded. This code detection is sensitive to escape sequences. If
10068 the sequence is valid as ISO2022, the code is determined as one of
10069 the ISO2022 encodings, and the file is decoded by the corresponding
10070 coding system (e.g. `iso-2022-7bit').
10071
10072 However, there may be a case that you want to read escape sequences in
10073 a file as is. In such a case, you can set this variable to non-nil.
10074 Then, as the code detection ignores any escape sequences, no file is
10075 detected as encoded in some ISO2022 encoding. The result is that all
10076 escape sequences become visible in a buffer.
10077
10078 The default value is nil, and it is strongly recommended not to change
10079 it. That is because many Emacs Lisp source files that contain
10080 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10081 in Emacs's distribution, and they won't be decoded correctly on
10082 reading if you suppress escape sequence detection.
10083
10084 The other way to read escape sequences in a file without decoding is
10085 to explicitly specify some coding system that doesn't use ISO2022's
10086 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10087 inhibit_iso_escape_detection = 0;
10088
10089 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10090 doc: /* Char table for translating self-inserting characters.
10091 This is applied to the result of input methods, not their input. See also
10092 `keyboard-translate-table'. */);
10093 Vtranslation_table_for_input = Qnil;
10094
10095 {
10096 Lisp_Object args[coding_arg_max];
10097 Lisp_Object plist[16];
10098 int i;
10099
10100 for (i = 0; i < coding_arg_max; i++)
10101 args[i] = Qnil;
10102
10103 plist[0] = intern (":name");
10104 plist[1] = args[coding_arg_name] = Qno_conversion;
10105 plist[2] = intern (":mnemonic");
10106 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10107 plist[4] = intern (":coding-type");
10108 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10109 plist[6] = intern (":ascii-compatible-p");
10110 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10111 plist[8] = intern (":default-char");
10112 plist[9] = args[coding_arg_default_char] = make_number (0);
10113 plist[10] = intern (":for-unibyte");
10114 plist[11] = args[coding_arg_for_unibyte] = Qt;
10115 plist[12] = intern (":docstring");
10116 plist[13] = build_string ("Do no conversion.\n\
10117 \n\
10118 When you visit a file with this coding, the file is read into a\n\
10119 unibyte buffer as is, thus each byte of a file is treated as a\n\
10120 character.");
10121 plist[14] = intern (":eol-type");
10122 plist[15] = args[coding_arg_eol_type] = Qunix;
10123 args[coding_arg_plist] = Flist (16, plist);
10124 Fdefine_coding_system_internal (coding_arg_max, args);
10125
10126 plist[1] = args[coding_arg_name] = Qundecided;
10127 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10128 plist[5] = args[coding_arg_coding_type] = Qundecided;
10129 /* This is already set.
10130 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10131 plist[8] = intern (":charset-list");
10132 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10133 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10134 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10135 plist[15] = args[coding_arg_eol_type] = Qnil;
10136 args[coding_arg_plist] = Flist (16, plist);
10137 Fdefine_coding_system_internal (coding_arg_max, args);
10138 }
10139
10140 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10141
10142 {
10143 int i;
10144
10145 for (i = 0; i < coding_category_max; i++)
10146 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10147 }
10148 #if defined (MSDOS) || defined (WINDOWSNT)
10149 system_eol_type = Qdos;
10150 #else
10151 system_eol_type = Qunix;
10152 #endif
10153 staticpro (&system_eol_type);
10154 }
10155
10156 char *
10157 emacs_strerror (error_number)
10158 int error_number;
10159 {
10160 char *str;
10161
10162 synchronize_system_messages_locale ();
10163 str = strerror (error_number);
10164
10165 if (! NILP (Vlocale_coding_system))
10166 {
10167 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10168 Vlocale_coding_system,
10169 0);
10170 str = (char *) SDATA (dec);
10171 }
10172
10173 return str;
10174 }
10175
10176 #endif /* emacs */
10177
10178 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10179 (do not change this comment) */