This file is part of GNU Emacs.
-GNU Emacs is free software; you can redistribute it and/or modify
+GNU Emacs is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3, or (at your option)
-any later version.
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with GNU Emacs; see the file COPYING. If not, write to
-the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-Boston, MA 02110-1301, USA. */
+along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
/*** TABLE OF CONTENTS ***
| CATEGORY_MASK_ISO_7_ELSE \
| CATEGORY_MASK_ISO_8_ELSE \
| CATEGORY_MASK_UTF_8 \
+ | CATEGORY_MASK_UTF_16_AUTO \
| CATEGORY_MASK_UTF_16_BE \
| CATEGORY_MASK_UTF_16_LE \
| CATEGORY_MASK_UTF_16_BE_NOSIG \
| CATEGORY_MASK_ISO_ELSE)
#define CATEGORY_MASK_UTF_16 \
- (CATEGORY_MASK_UTF_16_BE \
+ (CATEGORY_MASK_UTF_16_AUTO \
+ | CATEGORY_MASK_UTF_16_BE \
| CATEGORY_MASK_UTF_16_LE \
| CATEGORY_MASK_UTF_16_BE_NOSIG \
| CATEGORY_MASK_UTF_16_LE_NOSIG)
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
- struct coding_system *,
+ struct coding_system *,
int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
struct coding_system *,
| CATEGORY_MASK_UTF_16_BE_NOSIG
| CATEGORY_MASK_UTF_16_LE_NOSIG);
}
- else if (c1 >= 0 && c2 >= 0)
+ else
{
+ /* We check the dispersion of Eth and Oth bytes where E is even and
+ O is odd. If both are high, we assume binary data.*/
+ unsigned char e[256], o[256];
+ unsigned e_num = 1, o_num = 1;
+
+ memset (e, 0, 256);
+ memset (o, 0, 256);
+ e[c1] = 1;
+ o[c2] = 1;
+
detect_info->rejected
|= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
+
+ while (1)
+ {
+ ONE_MORE_BYTE (c1);
+ ONE_MORE_BYTE (c2);
+ if (! e[c1])
+ {
+ e[c1] = 1;
+ e_num++;
+ if (e_num >= 128)
+ break;
+ }
+ if (! o[c2])
+ {
+ o[c1] = 1;
+ o_num++;
+ if (o_num >= 128)
+ break;
+ }
+ }
+ detect_info->rejected |= CATEGORY_MASK_UTF_16;
+ return 0;
}
+
no_more_source:
return 1;
}
/* Perhaps the start of composite character. We simple skip
it because analyzing it is too heavy for detecting. But,
at least, we check that the composite character
- constitues of more than 4 bytes. */
+ constitutes of more than 4 bytes. */
const unsigned char *src_base;
repeat:
struct coding_system *this = &(coding_categories[i]);
Lisp_Object attrs, val;
+ if (this->id < 0)
+ continue;
attrs = CODING_ID_ATTRS (this->id);
if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
&& ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
else
{
ASSURE_DESTINATION (ccl.produced);
- for (i = 0; i < ccl.produced; i++)
+ for (i = 0; i < ccl.produced; i++)
*dst++ = destination_charbuf[i] & 0xFF;
produced_chars += ccl.produced;
}
*dst++ = CHAR_TO_BYTE8 (c);
else
CHAR_STRING_ADVANCE (c, dst);
- produced_chars++;
}
}
else
ASSURE_DESTINATION (charbuf_end - charbuf);
while (charbuf < charbuf_end && dst < dst_end)
*dst++ = *charbuf++;
- produced_chars = dst - (coding->destination + coding->dst_bytes);
}
+ produced_chars = charbuf - coding->charbuf;
}
record_conversion_result (coding, CODING_RESULT_SUCCESS);
coding->produced_char += produced_chars;
if (src == src_end)
goto too_short;
ONE_MORE_BYTE (c);
- if (c < charset->code_space[(dim - 1 - idx) * 2]
+ if (c < charset->code_space[(dim - 1 - idx) * 2]
|| c > charset->code_space[(dim - 1 - idx) * 2 + 1])
break;
}
{
int c, i;
struct coding_detection_info detect_info;
+ int null_byte_found = 0, eight_bit_found = 0;
detect_info.checked = detect_info.found = detect_info.rejected = 0;
- for (i = 0, src = coding->source; src < src_end; i++, src++)
+ coding->head_ascii = -1;
+ for (src = coding->source; src < src_end; src++)
{
c = *src;
if (c & 0x80)
- break;
- if (c < 0x20
- && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
- && ! inhibit_iso_escape_detection
- && ! detect_info.checked)
{
- coding->head_ascii = src - (coding->source + coding->consumed);
- if (detect_coding_iso_2022 (coding, &detect_info))
+ eight_bit_found = 1;
+ if (coding->head_ascii < 0)
+ coding->head_ascii = src - coding->source;
+ if (null_byte_found)
+ break;
+ }
+ else if (c < 0x20)
+ {
+ if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+ && ! inhibit_iso_escape_detection
+ && ! detect_info.checked)
{
- /* We have scanned the whole data. */
- if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
- /* We didn't find an 8-bit code. */
- src = src_end;
- break;
+ if (coding->head_ascii < 0)
+ coding->head_ascii = src - coding->source;
+ if (detect_coding_iso_2022 (coding, &detect_info))
+ {
+ /* We have scanned the whole data. */
+ if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
+ /* We didn't find an 8-bit code. We may have
+ found a null-byte, but it's very rare that
+ a binary file confirm to ISO-2022. */
+ src = src_end;
+ break;
+ }
+ }
+ else if (! c)
+ {
+ null_byte_found = 1;
+ if (eight_bit_found)
+ break;
}
}
}
- coding->head_ascii = src - (coding->source + coding->consumed);
+ if (coding->head_ascii < 0)
+ coding->head_ascii = src - coding->source;
- if (coding->head_ascii < coding->src_bytes
+ if (null_byte_found || eight_bit_found
+ || coding->head_ascii < coding->src_bytes
|| detect_info.found)
{
enum coding_category category;
break;
}
else
- for (i = 0; i < coding_category_raw_text; i++)
- {
- category = coding_priorities[i];
- this = coding_categories + category;
- if (this->id < 0)
- {
- /* No coding system of this category is defined. */
- detect_info.rejected |= (1 << category);
- }
- else if (category >= coding_category_raw_text)
- continue;
- else if (detect_info.checked & (1 << category))
- {
- if (detect_info.found & (1 << category))
- break;
- }
- else if ((*(this->detector)) (coding, &detect_info)
- && detect_info.found & (1 << category))
- {
- if (category == coding_category_utf_16_auto)
- {
- if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
- category = coding_category_utf_16_le;
- else
- category = coding_category_utf_16_be;
- }
- break;
- }
- }
-
- if (i < coding_category_raw_text)
- setup_coding_system (CODING_ID_NAME (this->id), coding);
- else if (detect_info.rejected == CATEGORY_MASK_ANY)
- setup_coding_system (Qraw_text, coding);
- else if (detect_info.rejected)
- for (i = 0; i < coding_category_raw_text; i++)
- if (! (detect_info.rejected & (1 << coding_priorities[i])))
+ {
+ if (null_byte_found)
{
- this = coding_categories + coding_priorities[i];
- setup_coding_system (CODING_ID_NAME (this->id), coding);
- break;
+ detect_info.checked |= ~CATEGORY_MASK_UTF_16;
+ detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
}
+ for (i = 0; i < coding_category_raw_text; i++)
+ {
+ category = coding_priorities[i];
+ this = coding_categories + category;
+ if (this->id < 0)
+ {
+ /* No coding system of this category is defined. */
+ detect_info.rejected |= (1 << category);
+ }
+ else if (category >= coding_category_raw_text)
+ continue;
+ else if (detect_info.checked & (1 << category))
+ {
+ if (detect_info.found & (1 << category))
+ break;
+ }
+ else if ((*(this->detector)) (coding, &detect_info)
+ && detect_info.found & (1 << category))
+ {
+ if (category == coding_category_utf_16_auto)
+ {
+ if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
+ category = coding_category_utf_16_le;
+ else
+ category = coding_category_utf_16_be;
+ }
+ break;
+ }
+ }
+
+ if (i < coding_category_raw_text)
+ setup_coding_system (CODING_ID_NAME (this->id), coding);
+ else if (null_byte_found)
+ setup_coding_system (Qno_conversion, coding);
+ else if ((detect_info.rejected & CATEGORY_MASK_ANY)
+ == CATEGORY_MASK_ANY)
+ setup_coding_system (Qraw_text, coding);
+ else if (detect_info.rejected)
+ for (i = 0; i < coding_category_raw_text; i++)
+ if (! (detect_info.rejected & (1 << coding_priorities[i])))
+ {
+ this = coding_categories + coding_priorities[i];
+ setup_coding_system (CODING_ID_NAME (this->id), coding);
+ break;
+ }
+ }
}
}
else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
{
Lisp_Object eol_type;
unsigned char *p, *pbeg, *pend;
-
+
eol_type = CODING_ID_EOL_TYPE (coding->id);
if (EQ (eol_type, Qunix))
return;
}
current = current_buffer;
set_buffer_internal (XBUFFER (workbuf));
- Ferase_buffer ();
+ Ferase_buffer ();
current_buffer->undo_list = Qt;
current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
set_buffer_internal (current);
{
code_conversion_save (0, 0);
coding->dst_object = Qnil;
- coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
+ /* Most callers presume this will return a multibyte result, and they
+ won't use `binary' or `raw-text' anyway, so let's not worry about
+ CODING_FOR_UNIBYTE. */
+ coding->dst_multibyte = 1;
}
decode_coding (coding);
doc: /* Return t if OBJECT is nil or a coding-system.
See the documentation of `define-coding-system' for information
about coding-system objects. */)
- (obj)
- Lisp_Object obj;
+ (object)
+ Lisp_Object object;
{
- if (NILP (obj)
- || CODING_SYSTEM_ID (obj) >= 0)
+ if (NILP (object)
+ || CODING_SYSTEM_ID (object) >= 0)
return Qt;
- if (! SYMBOLP (obj)
- || NILP (Fget (obj, Qcoding_system_define_form)))
+ if (! SYMBOLP (object)
+ || NILP (Fget (object, Qcoding_system_define_form)))
return Qnil;
return Qt;
}
int id;
struct coding_detection_info detect_info;
enum coding_category base_category;
+ int null_byte_found = 0, eight_bit_found = 0;
if (NILP (coding_system))
coding_system = Qundecided;
struct coding_system *this;
int c, i;
+ coding.head_ascii = -1;
/* Skip all ASCII bytes except for a few ISO2022 controls. */
- for (i = 0; src < src_end; i++, src++)
+ for (; src < src_end; src++)
{
c = *src;
if (c & 0x80)
- break;
- if (c < 0x20
- && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
- && ! inhibit_iso_escape_detection)
{
- coding.head_ascii = src - coding.source;
- if (detect_coding_iso_2022 (&coding, &detect_info))
+ eight_bit_found = 1;
+ if (coding.head_ascii < 0)
+ coding.head_ascii = src - coding.source;
+ if (null_byte_found)
+ break;
+ }
+ if (c < 0x20)
+ {
+ if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+ && ! inhibit_iso_escape_detection
+ && ! detect_info.checked)
{
- /* We have scanned the whole data. */
- if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
- /* We didn't find an 8-bit code. */
- src = src_end;
- break;
+ if (coding.head_ascii < 0)
+ coding.head_ascii = src - coding.source;
+ if (detect_coding_iso_2022 (&coding, &detect_info))
+ {
+ /* We have scanned the whole data. */
+ if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
+ /* We didn't find an 8-bit code. We may have
+ found a null-byte, but it's very rare that
+ a binary file confirm to ISO-2022. */
+ src = src_end;
+ break;
+ }
+ }
+ else if (! c)
+ {
+ null_byte_found = 1;
+ if (eight_bit_found)
+ break;
}
}
}
- coding.head_ascii = src - coding.source;
+ if (coding.head_ascii < 0)
+ coding.head_ascii = src - coding.source;
- if (src < src_end
+ if (null_byte_found || eight_bit_found
+ || coding.head_ascii < coding.src_bytes
|| detect_info.found)
{
- if (src == src_end)
+ if (coding.head_ascii == coding.src_bytes)
/* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
for (i = 0; i < coding_category_raw_text; i++)
{
break;
}
else
- for (i = 0; i < coding_category_raw_text; i++)
- {
- category = coding_priorities[i];
- this = coding_categories + category;
+ {
+ if (null_byte_found)
+ {
+ detect_info.checked |= ~CATEGORY_MASK_UTF_16;
+ detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
+ }
+ for (i = 0; i < coding_category_raw_text; i++)
+ {
+ category = coding_priorities[i];
+ this = coding_categories + category;
- if (this->id < 0)
- {
- /* No coding system of this category is defined. */
- detect_info.rejected |= (1 << category);
- }
- else if (category >= coding_category_raw_text)
- continue;
- else if (detect_info.checked & (1 << category))
- {
- if (highest
- && (detect_info.found & (1 << category)))
- break;
- }
- else
- {
- if ((*(this->detector)) (&coding, &detect_info)
- && highest
- && (detect_info.found & (1 << category)))
- {
- if (category == coding_category_utf_16_auto)
- {
- if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
- category = coding_category_utf_16_le;
- else
- category = coding_category_utf_16_be;
- }
+ if (this->id < 0)
+ {
+ /* No coding system of this category is defined. */
+ detect_info.rejected |= (1 << category);
+ }
+ else if (category >= coding_category_raw_text)
+ continue;
+ else if (detect_info.checked & (1 << category))
+ {
+ if (highest
+ && (detect_info.found & (1 << category)))
break;
- }
- }
- }
+ }
+ else if ((*(this->detector)) (&coding, &detect_info)
+ && highest
+ && (detect_info.found & (1 << category)))
+ {
+ if (category == coding_category_utf_16_auto)
+ {
+ if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
+ category = coding_category_utf_16_le;
+ else
+ category = coding_category_utf_16_be;
+ }
+ break;
+ }
+ }
+ }
}
- if (detect_info.rejected == CATEGORY_MASK_ANY)
+ if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
{
detect_info.found = CATEGORY_MASK_RAW_TEXT;
id = coding_categories[coding_category_raw_text].id;
if (VECTORP (eol_type))
{
if (detect_info.found & ~CATEGORY_MASK_UTF_16)
- normal_eol = detect_eol (coding.source, src_bytes,
- coding_category_raw_text);
+ {
+ if (null_byte_found)
+ normal_eol = EOL_SEEN_LF;
+ else
+ normal_eol = detect_eol (coding.source, src_bytes,
+ coding_category_raw_text);
+ }
if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
| CATEGORY_MASK_UTF_16_BE_NOSIG))
utf_16_be_eol = detect_eol (coding.source, src_bytes,
Return a list of possible coding systems ordered by priority.
If only ASCII characters are found (except for such ISO-2022 control
-characters ISO-2022 as ESC), it returns a list of single element
-`undecided' or its subsidiary coding system according to a detected
-end-of-line format.
+characters as ESC), it returns a list of single element `undecided'
+or its subsidiary coding system according to a detected end-of-line
+format.
If optional argument HIGHEST is non-nil, return the coding system of
highest priority. */)
Return a list of possible coding systems ordered by priority.
If only ASCII characters are found (except for such ISO-2022 control
-characters ISO-2022 as ESC), it returns a list of single element
-`undecided' or its subsidiary coding system according to a detected
-end-of-line format.
+characters as ESC), it returns a list of single element `undecided'
+or its subsidiary coding system according to a detected end-of-line
+format.
If optional argument HIGHEST is non-nil, return the coding system of
highest priority. */)
Sunencodable_char_position, 3, 5, 0,
doc: /*
Return position of first un-encodable character in a region.
-START and END specfiy the region and CODING-SYSTEM specifies the
+START and END specify the region and CODING-SYSTEM specifies the
encoding to check. Return nil if CODING-SYSTEM does encode the region.
If optional 4th argument COUNT is non-nil, it specifies at most how
CODING-SYSTEM-LIST is a list of coding systems to check.
The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
-CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
+CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
whole region, POS0, POS1, ... are buffer positions where non-encodable
characters are found.
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3, 4, "r\nzCoding system: ",
doc: /* Encode the current region by specified coding system.
-When called from a program, takes three arguments:
-START, END, and CODING-SYSTEM. START and END are buffer positions.
+When called from a program, takes four arguments:
+ START, END, CODING-SYSTEM and DESTINATION.
+START and END are buffer positions.
Optional 4th arguments DESTINATION specifies where the encoded text goes.
If nil, the region between START and END is replace by the encoded text.
Optional third arg NOCOPY non-nil means it is OK to return STRING itself
if the decoding operation is trivial.
-Optional fourth arg BUFFER non-nil meant that the decoded text is
+Optional fourth arg BUFFER non-nil means that the decoded text is
inserted in BUFFER instead of returned as a string. In this case,
the return value is BUFFER.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
-not fully specified. */)
+not fully specified.) */)
(string, coding_system, nocopy, buffer)
Lisp_Object string, coding_system, nocopy, buffer;
{
Optional third arg NOCOPY non-nil means it is OK to return STRING
itself if the encoding operation is trivial.
-Optional fourth arg BUFFER non-nil meant that the encoded text is
+Optional fourth arg BUFFER non-nil means that the encoded text is
inserted in BUFFER instead of returned as a string. In this case,
the return value is BUFFER.
TARGET has a meaning which depends on OPERATION:
For file I/O, TARGET is a file name (except for the special case below).
For process I/O, TARGET is a process name.
- For network I/O, TARGET is a service name or a port number
+ For network I/O, TARGET is a service name or a port number.
-This function looks up what specified for TARGET in,
+This function looks up what is specified for TARGET in
`file-coding-system-alist', `process-coding-system-alist',
or `network-coding-system-alist' depending on OPERATION.
They may specify a coding system, a cons of coding systems,
operation = args[0];
if (!SYMBOLP (operation)
|| !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
- error ("Invalid first arguement");
+ error ("Invalid first argument");
if (nargs < 1 + XINT (target_idx))
error ("Too few arguments for operation: %s",
SDATA (SYMBOL_NAME (operation)));
DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
Sset_coding_system_priority, 0, MANY, 0,
doc: /* Assign higher priority to the coding systems given as arguments.
-If multiple coding systems belongs to the same category,
+If multiple coding systems belong to the same category,
all but the first one are ignored.
-usage: (set-coding-system-priority ...) */)
+usage: (set-coding-system-priority &rest coding-systems) */)
(nargs, args)
int nargs;
Lisp_Object *args;
= Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
CODING_ATTR_PLIST (attrs)));
CODING_ATTR_PLIST (attrs)
- = Fcons (QCascii_compatible_p,
+ = Fcons (QCascii_compatible_p,
Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
CODING_ATTR_PLIST (attrs)));
CHECK_SYMBOL (alias);
CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
aliases = AREF (spec, 1);
- /* ALISES should be a list of length more than zero, and the first
+ /* ALIASES should be a list of length more than zero, and the first
element is a base coding system. Append ALIAS at the tail of the
list. */
while (!NILP (XCDR (aliases)))
DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
Scoding_system_eol_type, 1, 1, 0,
doc: /* Return eol-type of CODING-SYSTEM.
-An eol-type is integer 0, 1, 2, or a vector of coding systems.
+An eol-type is an integer 0, 1, 2, or a vector of coding systems.
Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
and CR respectively.