/* Coding system handler (conversion, detection, etc.).
- Copyright (C) 1995, 1997, 1998, 2002 Electrotechnical Laboratory, JAPAN.
+ Copyright (C) 1995,97,1998,2002,2003 Electrotechnical Laboratory, JAPAN.
Licensed to the Free Software Foundation.
- Copyright (C) 2001,2002 Free Software Foundation, Inc.
+ Copyright (C) 2001,2002,2003 Free Software Foundation, Inc.
This file is part of GNU Emacs.
end-of-line format. */
Lisp_Object Qemacs_mule, Qraw_text;
+Lisp_Object Qutf_8;
+
/* Coding-systems are handed between Emacs Lisp programs and C internal
routines by the following three variables. */
/* Coding-system for reading files and receiving data from process. */
#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p) \
do { \
int bytes; \
- \
+ \
c = SAFE_ONE_MORE_BYTE (); \
if (c < 0) \
break; \
break; \
*p++ = c; \
} \
- if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)) \
+ if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes) \
+ || (coding->flags /* We are recovering a file. */ \
+ && p0[0] == LEADING_CODE_8_BIT_CONTROL \
+ && ! CHAR_HEAD_P (p0[1]))) \
c = STRING_CHAR (p0, bytes); \
else \
c = -1; \
else
{
int bytes;
- if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
+ if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
+ || (coding->flags /* We are recovering a file. */
+ && src[0] == LEADING_CODE_8_BIT_CONTROL
+ && ! CHAR_HEAD_P (src[1])))
c = STRING_CHAR (src, bytes);
else
c = *src, bytes = 1;
p = tmp;
src++;
}
- else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
+ else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
+ || (coding->flags /* We are recovering a file. */
+ && src[0] == LEADING_CODE_8_BIT_CONTROL
+ && ! CHAR_HEAD_P (src[1])))
{
p = src;
src += bytes;
EMIT_ONE_BYTE ('\r');
}
else if (SINGLE_BYTE_CHAR_P (c))
- EMIT_ONE_BYTE (c);
+ {
+ if (coding->flags && ! ASCII_BYTE_P (c))
+ {
+ /* As we are auto saving, retain the multibyte form for
+ 8-bit chars. */
+ unsigned char buf[MAX_MULTIBYTE_LENGTH];
+ int bytes = CHAR_STRING (c, buf);
+
+ if (bytes == 1)
+ EMIT_ONE_BYTE (buf[0]);
+ else
+ EMIT_TWO_BYTES (buf[0], buf[1]);
+ }
+ else
+ EMIT_ONE_BYTE (c);
+ }
else
EMIT_BYTES (src_base, src);
coding->consumed_char++;
}
continue;
+ case '%':
+ if (COMPOSING_P (coding))
+ DECODE_COMPOSITION_END ('1');
+ ONE_MORE_BYTE (c1);
+ if (c1 == '/')
+ {
+ /* CTEXT extended segment:
+ ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
+ We keep these bytes as is for the moment.
+ They may be decoded by post-read-conversion. */
+ int dim, M, L;
+ int size, required;
+ int produced_chars;
+
+ ONE_MORE_BYTE (dim);
+ ONE_MORE_BYTE (M);
+ ONE_MORE_BYTE (L);
+ size = ((M - 128) * 128) + (L - 128);
+ required = 8 + size * 2;
+ if (dst + required > (dst_bytes ? dst_end : src))
+ goto label_end_of_loop;
+ *dst++ = ISO_CODE_ESC;
+ *dst++ = '%';
+ *dst++ = '/';
+ *dst++ = dim;
+ produced_chars = 4;
+ dst += CHAR_STRING (M, dst), produced_chars++;
+ dst += CHAR_STRING (L, dst), produced_chars++;
+ while (size-- > 0)
+ {
+ ONE_MORE_BYTE (c1);
+ dst += CHAR_STRING (c1, dst), produced_chars++;
+ }
+ coding->produced_char += produced_chars;
+ }
+ else if (c1 == 'G')
+ {
+ unsigned char *d = dst;
+ int produced_chars;
+
+ /* XFree86 extension for embedding UTF-8 in CTEXT:
+ ESC % G --UTF-8-BYTES-- ESC % @
+ We keep these bytes as is for the moment.
+ They may be decoded by post-read-conversion. */
+ if (d + 6 > (dst_bytes ? dst_end : src))
+ goto label_end_of_loop;
+ *d++ = ISO_CODE_ESC;
+ *d++ = '%';
+ *d++ = 'G';
+ produced_chars = 3;
+ while (d + 1 < (dst_bytes ? dst_end : src))
+ {
+ ONE_MORE_BYTE (c1);
+ if (c1 == ISO_CODE_ESC
+ && src + 1 < src_end
+ && src[0] == '%'
+ && src[1] == '@')
+ break;
+ d += CHAR_STRING (c1, d), produced_chars++;
+ }
+ if (d + 3 > (dst_bytes ? dst_end : src))
+ goto label_end_of_loop;
+ *d++ = ISO_CODE_ESC;
+ *d++ = '%';
+ *d++ = '@';
+ dst = d;
+ coding->produced_char += produced_chars + 3;
+ }
+ else
+ goto label_invalid_code;
+ continue;
+
default:
if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
goto label_invalid_code;
/* Instead of encoding character C, produce one or two `?'s. */
-#define ENCODE_UNSAFE_CHARACTER(c) \
- do { \
- ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \
- if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \
- ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \
+#define ENCODE_UNSAFE_CHARACTER(c) \
+ do { \
+ ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \
+ if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \
+ ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \
} while (0)
Lisp_Object translation_table;
Lisp_Object safe_chars;
+ if (coding->flags & CODING_FLAG_ISO_SAFE)
+ coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
+
safe_chars = coding_safe_chars (coding->symbol);
if (NILP (Venable_character_translation))
}
else
{
- if (coding->flags & CODING_FLAG_ISO_SAFE
+ if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
&& ! CODING_SAFE_CHAR_P (safe_chars, c))
ENCODE_UNSAFE_CHARACTER (c);
else
*dst++ = c;
coding->errors++;
}
- else if (coding->flags & CODING_FLAG_ISO_SAFE
+ else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
&& ! CODING_SAFE_CHAR_P (safe_chars, c))
ENCODE_UNSAFE_CHARACTER (c);
else
EMIT_ONE_BYTE (c1 | 0x80);
else if (charset == charset_latin_jisx0201)
EMIT_ONE_BYTE (c1);
+ else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
+ {
+ EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
+ if (CHARSET_WIDTH (charset) > 1)
+ EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
+ }
else
/* There's no way other than producing the internal
codes as is. */
ENCODE_BIG5 (charset, c1, c2, c1, c2);
EMIT_TWO_BYTES (c1, c2);
}
+ else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
+ {
+ EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
+ if (CHARSET_WIDTH (charset) > 1)
+ EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
+ }
else
/* There's no way other than producing the internal
codes as is. */
int magnification;
if (coding->type == coding_type_ccl)
- magnification = coding->spec.ccl.encoder.buf_magnification;
+ {
+ magnification = coding->spec.ccl.encoder.buf_magnification;
+ if (coding->eol_type == CODING_EOL_CRLF)
+ magnification *= 2;
+ }
else if (CODING_REQUIRE_ENCODING (coding))
magnification = 3;
else
if (ccl->eol_type ==CODING_EOL_UNDECIDED)
ccl->eol_type = CODING_EOL_LF;
ccl->cr_consumed = coding->spec.ccl.cr_carryover;
+ ccl->eight_bit_control = coding->dst_multibyte;
}
+ else
+ ccl->eight_bit_control = 1;
ccl->multibyte = coding->src_multibyte;
if (coding->spec.ccl.eight_bit_carryover[0] != 0)
{
shrinked_bytes = from + (SBYTES (str) - to_byte);
}
- if (!require_decoding)
+ if (!require_decoding
+ && !(SYMBOLP (coding->post_read_conversion)
+ && !NILP (Ffboundp (coding->post_read_conversion))))
{
coding->consumed = SBYTES (str);
coding->consumed_char = SCHARS (str);
STRING_MULTIBYTE (string));
}
-static int coding_system_accept_latin_extra_p P_ ((Lisp_Object));
-
-static int
-coding_system_accept_latin_extra_p (coding_system)
- Lisp_Object coding_system;
-{
- Lisp_Object coding_spec, coding_type, flags;
-
- coding_spec = Fget (coding_system, Qcoding_system);
- if (! VECTORP (coding_spec)
- || ASIZE (coding_spec) != 5)
- return 0;
- coding_type = AREF (coding_spec, 0);
- if (! EQ (coding_type, make_number (2)))
- return 0;
- flags = AREF (coding_spec, 4);
- return (VECTORP (flags)
- && ! NILP (AREF (flags, CODING_FLAG_ISO_LATIN_EXTRA)));
-}
-
/* Subroutine for Fsafe_coding_systems_region_internal.
Return a list of coding systems that safely encode the multibyte
- text between P and PEND. SAFE_CODINGS, if non-nil, is a list of
+ text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of
possible coding systems. If it is nil, it means that we have not
yet found any coding systems.
Lisp_Object safe_codings, work_table;
int *single_byte_char_found;
{
- int c, len, i;
+ int c, len;
Lisp_Object val, ch;
Lisp_Object prev, tail;
if (SINGLE_BYTE_CHAR_P (c))
*single_byte_char_found = 1;
if (NILP (safe_codings))
- /* Already all coding systems are excluded. */
+ /* All coding systems are already excluded. But we can't
+ terminate the loop here, because we must still look for a
+ non-ASCII single-byte char in the rest of the text. */
continue;
/* Check the safe coding systems for C. */
ch = make_number (c);
for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
{
- val = XCAR (tail);
- if (NILP (Faref (XCDR (val), ch))
- && !(SINGLE_BYTE_CHAR_P (c)
- && VECTORP (Vlatin_extra_code_table)
- && ! NILP (AREF (Vlatin_extra_code_table, c))
- && coding_system_accept_latin_extra_p (XCAR (val))))
+ Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
+ int encodable;
+
+ elt = XCAR (tail);
+ if (CONSP (XCDR (elt)))
{
- /* Exclued this coding system from SAFE_CODINGS. */
+ /* This entry has this format now:
+ ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
+ ACCEPT-LATIN-EXTRA ) */
+ val = XCDR (elt);
+ encodable = ! NILP (Faref (XCAR (val), ch));
+ if (! encodable)
+ {
+ val = XCDR (val);
+ translation_table = XCAR (val);
+ hash_table = XCAR (XCDR (val));
+ accept_latin_extra = XCAR (XCDR (XCDR (val)));
+ }
+ }
+ else
+ {
+ /* This entry has this format now: ( CODING . SAFE-CHARS) */
+ encodable = ! NILP (Faref (XCDR (elt), ch));
+ if (! encodable)
+ {
+ /* Transform the format to:
+ ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
+ ACCEPT-LATIN-EXTRA ) */
+ val = Fget (XCAR (elt), Qcoding_system);
+ translation_table
+ = Fplist_get (AREF (val, 3),
+ Qtranslation_table_for_encode);
+ if (SYMBOLP (translation_table))
+ translation_table = Fget (translation_table,
+ Qtranslation_table);
+ hash_table
+ = (CHAR_TABLE_P (translation_table)
+ ? XCHAR_TABLE (translation_table)->extras[1]
+ : Qnil);
+ accept_latin_extra
+ = ((EQ (AREF (val, 0), make_number (2))
+ && VECTORP (AREF (val, 4)))
+ ? AREF (AREF (val, 4), 16)
+ : Qnil);
+ XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
+ translation_table, hash_table,
+ accept_latin_extra));
+ }
+ }
+
+ if (! encodable
+ && ((CHAR_TABLE_P (translation_table)
+ && ! NILP (Faref (translation_table, ch)))
+ || (HASH_TABLE_P (hash_table)
+ && ! NILP (Fgethash (ch, hash_table, Qnil)))
+ || (SINGLE_BYTE_CHAR_P (c)
+ && ! NILP (accept_latin_extra)
+ && VECTORP (Vlatin_extra_code_table)
+ && ! NILP (AREF (Vlatin_extra_code_table, c)))))
+ encodable = 1;
+ if (encodable)
+ prev = tail;
+ else
+ {
+ /* Exclude this coding system from SAFE_CODINGS. */
if (EQ (tail, safe_codings))
safe_codings = XCDR (safe_codings);
else
XSETCDR (prev, XCDR (tail));
}
- else
- prev = tail;
}
}
return safe_codings;
CHECK_SYMBOL (coding_system);
setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
/* We had better not send unsafe characters to terminal. */
- terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
+ terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
/* Character composition should be disabled. */
terminal_coding.composing = COMPOSITION_DISABLED;
/* Error notification should be suppressed. */
Qtranslation_table = intern ("translation-table");
staticpro (&Qtranslation_table);
- Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
+ Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
Qtranslation_table_id = intern ("translation-table-id");
staticpro (&Qtranslation_table_id);
Qraw_text = intern ("raw-text");
staticpro (&Qraw_text);
+ Qutf_8 = intern ("utf-8");
+ staticpro (&Qutf_8);
+
defsubr (&Scoding_system_p);
defsubr (&Sread_coding_system);
defsubr (&Sread_non_nil_coding_system);
Vcoding_system_for_write = Qnil;
DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
- doc: /* Coding system used in the latest file or process I/O. */);
+ doc: /* Coding system used in the latest file or process I/O.
+Also set by `encode-coding-region', `decode-coding-region',
+`encode-coding-string' and `decode-coding-string'. */);
Vlast_coding_system_used = Qnil;
DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,