X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/b35f288d478ef137a4d9e8e5a6a5f368a86b01f5..9f2554de935574cb1168b8de6fb3b38079bc4b80:/src/coding.c diff --git a/src/coding.c b/src/coding.c index bd298a2f00..e292f80859 100644 --- a/src/coding.c +++ b/src/coding.c @@ -314,7 +314,7 @@ Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5; Lisp_Object Qbig, Qlittle; Lisp_Object Qcoding_system_history; Lisp_Object Qvalid_codes; -Lisp_Object QCcategory, QCmnemonic, QCdefalut_char; +Lisp_Object QCcategory, QCmnemonic, QCdefault_char; Lisp_Object QCdecode_translation_table, QCencode_translation_table; Lisp_Object QCpost_read_conversion, QCpre_write_conversion; Lisp_Object QCascii_compatible_p; @@ -546,6 +546,9 @@ enum iso_code_class_type character is prohibited by CODING_ISO_FLAG_SAFE. */ #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?' +/* UTF-8 section */ +#define CODING_UTF_8_BOM(coding) \ + ((coding)->spec.utf_8_bom) /* UTF-16 section */ #define CODING_UTF_16_BOM(coding) \ @@ -576,7 +579,9 @@ enum coding_category coding_category_iso_8_2, coding_category_iso_7_else, coding_category_iso_8_else, - coding_category_utf_8, + coding_category_utf_8_auto, + coding_category_utf_8_nosig, + coding_category_utf_8_sig, coding_category_utf_16_auto, coding_category_utf_16_be, coding_category_utf_16_le, @@ -600,7 +605,9 @@ enum coding_category #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) -#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8) +#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto) +#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig) +#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig) #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto) #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) #define CATEGORY_MASK_UTF_16_LE (1 << 
coding_category_utf_16_le) @@ -622,7 +629,9 @@ enum coding_category | CATEGORY_MASK_ISO_8_2 \ | CATEGORY_MASK_ISO_7_ELSE \ | CATEGORY_MASK_ISO_8_ELSE \ - | CATEGORY_MASK_UTF_8 \ + | CATEGORY_MASK_UTF_8_AUTO \ + | CATEGORY_MASK_UTF_8_NOSIG \ + | CATEGORY_MASK_UTF_8_SIG \ | CATEGORY_MASK_UTF_16_AUTO \ | CATEGORY_MASK_UTF_16_BE \ | CATEGORY_MASK_UTF_16_LE \ @@ -662,6 +671,10 @@ enum coding_category | CATEGORY_MASK_UTF_16_BE_NOSIG \ | CATEGORY_MASK_UTF_16_LE_NOSIG) +#define CATEGORY_MASK_UTF_8 \ + (CATEGORY_MASK_UTF_8_AUTO \ + | CATEGORY_MASK_UTF_8_NOSIG \ + | CATEGORY_MASK_UTF_8_SIG) /* List of symbols `coding-category-xxx' ordered by priority. This variable is exposed to Emacs Lisp. */ @@ -1214,6 +1227,11 @@ alloc_destination (coding, nbytes, dst) #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) +#define UTF_BOM 0xFEFF +#define UTF_8_BOM_1 0xEF +#define UTF_8_BOM_2 0xBB +#define UTF_8_BOM_3 0xBF + static int detect_coding_utf_8 (coding, detect_info) struct coding_system *coding; @@ -1223,6 +1241,7 @@ detect_coding_utf_8 (coding, detect_info) const unsigned char *src_end = coding->source + coding->src_bytes; int multibytep = coding->src_multibyte; int consumed_chars = 0; + int bom_found = 0; int found = 0; detect_info->checked |= CATEGORY_MASK_UTF_8; @@ -1242,7 +1261,7 @@ detect_coding_utf_8 (coding, detect_info) break; if (UTF_8_2_OCTET_LEADING_P (c)) { - found = CATEGORY_MASK_UTF_8; + found = 1; continue; } ONE_MORE_BYTE (c2); @@ -1250,7 +1269,10 @@ detect_coding_utf_8 (coding, detect_info) break; if (UTF_8_3_OCTET_LEADING_P (c)) { - found = CATEGORY_MASK_UTF_8; + found = 1; + if (src_base == coding->source + && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3) + bom_found = 1; continue; } ONE_MORE_BYTE (c3); @@ -1258,7 +1280,7 @@ detect_coding_utf_8 (coding, detect_info) break; if (UTF_8_4_OCTET_LEADING_P (c)) { - found = CATEGORY_MASK_UTF_8; + found = 1; continue; } ONE_MORE_BYTE (c4); @@ 
-1266,7 +1288,7 @@ detect_coding_utf_8 (coding, detect_info) break; if (UTF_8_5_OCTET_LEADING_P (c)) { - found = CATEGORY_MASK_UTF_8; + found = 1; continue; } break; @@ -1280,7 +1302,17 @@ detect_coding_utf_8 (coding, detect_info) detect_info->rejected |= CATEGORY_MASK_UTF_8; return 0; } - detect_info->found |= found; + if (bom_found) + { + /* The first character 0xFEFF doesn't necessarily mean a BOM. */ + detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; + } + else + { + detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG; + if (found) + detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG; + } return 1; } @@ -1294,14 +1326,48 @@ decode_coding_utf_8 (coding) const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; int *charbuf_end = coding->charbuf + coding->charbuf_size; - int consumed_chars = 0, consumed_chars_base; + int consumed_chars = 0, consumed_chars_base = 0; int multibytep = coding->src_multibyte; + enum utf_bom_type bom = CODING_UTF_8_BOM (coding); Lisp_Object attr, charset_list; int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos); int byte_after_cr = -1; CODING_GET_INFO (coding, attr, charset_list); + if (bom != utf_without_bom) + { + int c1, c2, c3; + + src_base = src; + ONE_MORE_BYTE (c1); + if (! UTF_8_3_OCTET_LEADING_P (c1)) + src = src_base; + else + { + ONE_MORE_BYTE (c2); + if (! UTF_8_EXTRA_OCTET_P (c2)) + src = src_base; + else + { + ONE_MORE_BYTE (c3); + if (! 
UTF_8_EXTRA_OCTET_P (c3)) + src = src_base; + else + { + if ((c1 != UTF_8_BOM_1) + || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3)) + src = src_base; + else + CODING_UTF_8_BOM (coding) = utf_without_bom; + } + } + } + } + CODING_UTF_8_BOM (coding) = utf_without_bom; + + + while (1) { int c, c1, c2, c3, c4, c5; @@ -1415,6 +1481,13 @@ encode_coding_utf_8 (coding) int produced_chars = 0; int c; + if (CODING_UTF_8_BOM (coding) == utf_with_bom) + { + ASSURE_DESTINATION (3); + EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3); + CODING_UTF_8_BOM (coding) = utf_without_bom; + } + if (multibytep) { int safe_room = MAX_MULTIBYTE_LENGTH * 2; @@ -1564,9 +1637,9 @@ decode_coding_utf_16 (coding) const unsigned char *src_base; int *charbuf = coding->charbuf + coding->charbuf_used; int *charbuf_end = coding->charbuf + coding->charbuf_size; - int consumed_chars = 0, consumed_chars_base; + int consumed_chars = 0, consumed_chars_base = 0; int multibytep = coding->src_multibyte; - enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); + enum utf_bom_type bom = CODING_UTF_16_BOM (coding); enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); int surrogate = CODING_UTF_16_SURROGATE (coding); Lisp_Object attr, charset_list; @@ -1575,7 +1648,7 @@ decode_coding_utf_16 (coding) CODING_GET_INFO (coding, attr, charset_list); - if (bom == utf_16_with_bom) + if (bom == utf_with_bom) { int c, c1, c2; @@ -1592,13 +1665,13 @@ decode_coding_utf_16 (coding) src = src_base; coding->errors++; } - CODING_UTF_16_BOM (coding) = utf_16_without_bom; + CODING_UTF_16_BOM (coding) = utf_without_bom; } - else if (bom == utf_16_detect_bom) + else if (bom == utf_detect_bom) { /* We have already tried to detect BOM and failed in detect_coding. 
*/ - CODING_UTF_16_BOM (coding) = utf_16_without_bom; + CODING_UTF_16_BOM (coding) = utf_without_bom; } while (1) @@ -1688,7 +1761,7 @@ encode_coding_utf_16 (coding) unsigned char *dst = coding->destination + coding->produced; unsigned char *dst_end = coding->destination + coding->dst_bytes; int safe_room = 8; - enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); + enum utf_bom_type bom = CODING_UTF_16_BOM (coding); int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian; int produced_chars = 0; Lisp_Object attrs, charset_list; @@ -1696,14 +1769,14 @@ encode_coding_utf_16 (coding) CODING_GET_INFO (coding, attrs, charset_list); - if (bom != utf_16_without_bom) + if (bom != utf_without_bom) { ASSURE_DESTINATION (safe_room); if (big_endian) EMIT_TWO_BYTES (0xFE, 0xFF); else EMIT_TWO_BYTES (0xFF, 0xFE); - CODING_UTF_16_BOM (coding) = utf_16_without_bom; + CODING_UTF_16_BOM (coding) = utf_without_bom; } while (charbuf < charbuf_end) @@ -2376,8 +2449,10 @@ encode_coding_emacs_mule (coding) if (preferred_charset_id >= 0) { charset = CHARSET_FROM_ID (preferred_charset_id); - if (! CHAR_CHARSET_P (c, charset)) - charset = char_charset (c, charset_list, NULL); + if (CHAR_CHARSET_P (c, charset)) + code = ENCODE_CHAR (charset, c); + else + charset = char_charset (c, charset_list, &code); } else charset = char_charset (c, charset_list, &code); @@ -2685,6 +2760,7 @@ detect_coding_iso_2022 (coding, detect_info) int i; int rejected = 0; int found = 0; + int composition_count = -1; detect_info->checked |= CATEGORY_MASK_ISO; @@ -2753,10 +2829,20 @@ detect_coding_iso_2022 (coding, detect_info) rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT; break; } + else if (c == '1') + { + /* End of composition. */ + if (composition_count < 0 + || composition_count > MAX_COMPOSITION_COMPONENTS) + /* Invalid */ + break; + composition_count = -1; + found |= CATEGORY_MASK_ISO; + } else if (c >= '0' && c <= '4') { /* ESC for start/end composition. 
*/ - found |= CATEGORY_MASK_ISO; + composition_count = 0; break; } else @@ -2827,6 +2913,8 @@ detect_coding_iso_2022 (coding, detect_info) continue; if (c < 0x80) { + if (composition_count >= 0) + composition_count++; single_shifting = 0; break; } @@ -2851,9 +2939,17 @@ detect_coding_iso_2022 (coding, detect_info) } if (i & 1 && src < src_end) - rejected |= CATEGORY_MASK_ISO_8_2; + { + rejected |= CATEGORY_MASK_ISO_8_2; + if (composition_count >= 0) + composition_count += i; + } else - found |= CATEGORY_MASK_ISO_8_2; + { + found |= CATEGORY_MASK_ISO_8_2; + if (composition_count >= 0) + composition_count += i / 2; + } } break; } @@ -2970,6 +3066,8 @@ detect_coding_iso_2022 (coding, detect_info) break; \ if (p == src_end - 1) \ { \ + if (coding->mode & CODING_MODE_LAST_BLOCK) \ + goto invalid_code; \ /* The current composition doesn't end in the current \ source. */ \ record_conversion_result \ @@ -3117,10 +3215,15 @@ decode_coding_iso_2022 (coding) if (composition_state == COMPOSING_RULE || composition_state == COMPOSING_COMPONENT_RULE) { - DECODE_COMPOSITION_RULE (c1); - components[component_idx++] = c1; - composition_state--; - continue; + if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1) + { + DECODE_COMPOSITION_RULE (c1); + components[component_idx++] = c1; + composition_state--; + continue; + } + /* Too long composition. 
*/ + MAYBE_FINISH_COMPOSITION (); } } if (charset_id_0 < 0 @@ -3137,10 +3240,14 @@ decode_coding_iso_2022 (coding) if (composition_state == COMPOSING_RULE || composition_state == COMPOSING_COMPONENT_RULE) { - DECODE_COMPOSITION_RULE (c1); - components[component_idx++] = c1; - composition_state--; - continue; + if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1) + { + DECODE_COMPOSITION_RULE (c1); + components[component_idx++] = c1; + composition_state--; + continue; + } + MAYBE_FINISH_COMPOSITION (); } } if (charset_id_0 < 0) @@ -3498,11 +3605,20 @@ decode_coding_iso_2022 (coding) } else { - components[component_idx++] = c; - if (method == COMPOSITION_WITH_RULE - || (method == COMPOSITION_WITH_RULE_ALTCHARS - && composition_state == COMPOSING_COMPONENT_CHAR)) - composition_state++; + if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1) + { + components[component_idx++] = c; + if (method == COMPOSITION_WITH_RULE + || (method == COMPOSITION_WITH_RULE_ALTCHARS + && composition_state == COMPOSING_COMPONENT_CHAR)) + composition_state++; + } + else + { + MAYBE_FINISH_COMPOSITION (); + *charbuf++ = c; + char_offset++; + } } continue; @@ -4873,7 +4989,6 @@ encode_coding_raw_text (coding) *dst++ = CHAR_TO_BYTE8 (c); else CHAR_STRING_ADVANCE (c, dst); - produced_chars++; } } else @@ -4881,8 +4996,8 @@ encode_coding_raw_text (coding) ASSURE_DESTINATION (charbuf_end - charbuf); while (charbuf < charbuf_end && dst < dst_end) *dst++ = *charbuf++; - produced_chars = dst - (coding->destination + coding->dst_bytes); } + produced_chars = dst - (coding->destination + coding->produced); } record_conversion_result (coding, CODING_RESULT_SUCCESS); coding->produced_char += produced_chars; @@ -4903,16 +5018,20 @@ detect_coding_charset (coding, detect_info) const unsigned char *src_end = coding->source + coding->src_bytes; int multibytep = coding->src_multibyte; int consumed_chars = 0; - Lisp_Object attrs, valids; + Lisp_Object attrs, valids, name; int found = 0; int head_ascii = 
coding->head_ascii; + int check_latin_extra = 0; detect_info->checked |= CATEGORY_MASK_CHARSET; coding = &coding_categories[coding_category_charset]; attrs = CODING_ID_ATTRS (coding->id); valids = AREF (attrs, coding_attr_charset_valids); - + name = CODING_ID_NAME (coding->id); + if (VECTORP (Vlatin_extra_code_table) + && strcmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-")) + check_latin_extra = 1; if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) src += head_ascii; @@ -4931,7 +5050,13 @@ detect_coding_charset (coding, detect_info) if (NILP (val)) break; if (c >= 0x80) - found = CATEGORY_MASK_CHARSET; + { + if (c < 0xA0 + && check_latin_extra + && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) + break; + found = CATEGORY_MASK_CHARSET; + } if (INTEGERP (val)) { charset = CHARSET_FROM_ID (XFASTINT (val)); @@ -5037,7 +5162,7 @@ decode_coding_charset (coding) code = c; val = AREF (valids, c); - if (NILP (val)) + if (! INTEGERP (val) && ! CONSP (val)) goto invalid_code; if (INTEGERP (val)) { @@ -5273,18 +5398,24 @@ setup_coding_system (coding_system, coding) } else if (EQ (coding_type, Qutf_8)) { + val = AREF (attrs, coding_attr_utf_bom); + CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom + : EQ (val, Qt) ? utf_with_bom + : utf_without_bom); coding->detector = detect_coding_utf_8; coding->decoder = decode_coding_utf_8; coding->encoder = encode_coding_utf_8; coding->common_flags |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); + if (CODING_UTF_8_BOM (coding) == utf_detect_bom) + coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; } else if (EQ (coding_type, Qutf_16)) { - val = AREF (attrs, coding_attr_utf_16_bom); - CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom - : EQ (val, Qt) ? utf_16_with_bom - : utf_16_without_bom); + val = AREF (attrs, coding_attr_utf_bom); + CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom + : EQ (val, Qt) ? 
utf_with_bom + : utf_without_bom); val = AREF (attrs, coding_attr_utf_16_endian); CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian : utf_16_little_endian); @@ -5294,7 +5425,7 @@ setup_coding_system (coding_system, coding) coding->encoder = encode_coding_utf_16; coding->common_flags |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); - if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom) + if (CODING_UTF_16_BOM (coding) == utf_detect_bom) coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; } else if (EQ (coding_type, Qccl)) @@ -5705,6 +5836,7 @@ detect_coding (coding) coding_set_source (coding); src_end = coding->source + coding->src_bytes; + coding->head_ascii = 0; /* If we have not yet decided the text encoding type, detect it now. */ @@ -5715,15 +5847,12 @@ detect_coding (coding) int null_byte_found = 0, eight_bit_found = 0; detect_info.checked = detect_info.found = detect_info.rejected = 0; - coding->head_ascii = -1; for (src = coding->source; src < src_end; src++) { c = *src; if (c & 0x80) { eight_bit_found = 1; - if (coding->head_ascii < 0) - coding->head_ascii = src - coding->source; if (null_byte_found) break; } @@ -5733,16 +5862,19 @@ detect_coding (coding) && ! inhibit_iso_escape_detection && ! detect_info.checked) { - if (coding->head_ascii < 0) - coding->head_ascii = src - coding->source; if (detect_coding_iso_2022 (coding, &detect_info)) { /* We have scanned the whole data. */ if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) - /* We didn't find an 8-bit code. We may have - found a null-byte, but it's very rare that - a binary file confirm to ISO-2022. */ - src = src_end; + { + /* We didn't find an 8-bit code. We may + have found a null-byte, but it's very + rare that a binary file conform to + ISO-2022. 
*/ + src = src_end; + coding->head_ascii = src - coding->source; + } + detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE; break; } } @@ -5752,10 +5884,12 @@ detect_coding (coding) if (eight_bit_found) break; } + if (! eight_bit_found) + coding->head_ascii++; } + else if (! eight_bit_found) + coding->head_ascii++; } - if (coding->head_ascii < 0) - coding->head_ascii = src - coding->source; if (null_byte_found || eight_bit_found || coding->head_ascii < coding->src_bytes @@ -5809,23 +5943,42 @@ detect_coding (coding) break; } } - - if (i < coding_category_raw_text) - setup_coding_system (CODING_ID_NAME (this->id), coding); - else if (null_byte_found) - setup_coding_system (Qno_conversion, coding); - else if ((detect_info.rejected & CATEGORY_MASK_ANY) - == CATEGORY_MASK_ANY) - setup_coding_system (Qraw_text, coding); - else if (detect_info.rejected) - for (i = 0; i < coding_category_raw_text; i++) - if (! (detect_info.rejected & (1 << coding_priorities[i]))) - { - this = coding_categories + coding_priorities[i]; - setup_coding_system (CODING_ID_NAME (this->id), coding); - break; - } } + + if (i < coding_category_raw_text) + setup_coding_system (CODING_ID_NAME (this->id), coding); + else if (null_byte_found) + setup_coding_system (Qno_conversion, coding); + else if ((detect_info.rejected & CATEGORY_MASK_ANY) + == CATEGORY_MASK_ANY) + setup_coding_system (Qraw_text, coding); + else if (detect_info.rejected) + for (i = 0; i < coding_category_raw_text; i++) + if (! 
(detect_info.rejected & (1 << coding_priorities[i]))) + { + this = coding_categories + coding_priorities[i]; + setup_coding_system (CODING_ID_NAME (this->id), coding); + break; + } + } + } + else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) + == coding_category_utf_8_auto) + { + Lisp_Object coding_systems; + struct coding_detection_info detect_info; + + coding_systems + = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); + detect_info.found = detect_info.rejected = 0; + coding->head_ascii = 0; + if (CONSP (coding_systems) + && detect_coding_utf_8 (coding, &detect_info)) + { + if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) + setup_coding_system (XCAR (coding_systems), coding); + else + setup_coding_system (XCDR (coding_systems), coding); } } else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) @@ -5835,8 +5988,9 @@ detect_coding (coding) struct coding_detection_info detect_info; coding_systems - = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom); + = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); detect_info.found = detect_info.rejected = 0; + coding->head_ascii = 0; if (CONSP (coding_systems) && detect_coding_utf_16 (coding, &detect_info)) { @@ -6178,7 +6332,7 @@ produce_chars (coding, translation_table, last_block) if (coding->src_multibyte) { int multibytep = 1; - EMACS_INT consumed_chars; + EMACS_INT consumed_chars = 0; while (1) { @@ -6871,13 +7025,17 @@ make_conversion_work_buffer (multibyte) } else { - name = Vcode_conversion_workbuf_name; - workbuf = Fget_buffer_create (name); - if (NILP (Vcode_conversion_reused_workbuf)) - Vcode_conversion_reused_workbuf = workbuf; + if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf))) + Vcode_conversion_reused_workbuf + = Fget_buffer_create (Vcode_conversion_workbuf_name); + workbuf = Vcode_conversion_reused_workbuf; } current = current_buffer; set_buffer_internal (XBUFFER (workbuf)); + /* We can't allow modification hooks to run in the work buffer. 
For + instance, directory_files_internal assumes that file decoding + doesn't compile new regexps. */ + Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt); Ferase_buffer (); current_buffer->undo_list = Qt; current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil; @@ -7332,8 +7490,13 @@ encode_coding_object (coding, src_object, from, from_byte, to, to_byte, } else { - coding->dst_pos = BUF_PT (XBUFFER (dst_object)); - coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object)); + struct buffer *current = current_buffer; + + set_buffer_temp (XBUFFER (dst_object)); + coding->dst_pos = PT; + coding->dst_pos_byte = PT_BYTE; + move_gap_both (coding->dst_pos, coding->dst_pos_byte); + set_buffer_temp (current); } coding->dst_multibyte = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters); @@ -7434,14 +7597,14 @@ DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0, doc: /* Return t if OBJECT is nil or a coding-system. See the documentation of `define-coding-system' for information about coding-system objects. */) - (obj) - Lisp_Object obj; + (object) + Lisp_Object object; { - if (NILP (obj) - || CODING_SYSTEM_ID (obj) >= 0) + if (NILP (object) + || CODING_SYSTEM_ID (object) >= 0) return Qt; - if (! SYMBOLP (obj) - || NILP (Fget (obj, Qcoding_system_define_form))) + if (! 
SYMBOLP (object) + || NILP (Fget (object, Qcoding_system_define_form))) return Qnil; return Qt; } @@ -7533,7 +7696,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, { const unsigned char *src_end = src + src_bytes; Lisp_Object attrs, eol_type; - Lisp_Object val; + Lisp_Object val = Qnil; struct coding_system coding; int id; struct coding_detection_info detect_info; @@ -7553,6 +7716,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, coding.src_multibyte = multibytep; coding.consumed = 0; coding.mode |= CODING_MODE_LAST_BLOCK; + coding.head_ascii = 0; detect_info.checked = detect_info.found = detect_info.rejected = 0; @@ -7564,7 +7728,6 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, struct coding_system *this; int c, i; - coding.head_ascii = -1; /* Skip all ASCII bytes except for a few ISO2022 controls. */ for (; src < src_end; src++) { @@ -7572,27 +7735,28 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, if (c & 0x80) { eight_bit_found = 1; - if (coding.head_ascii < 0) - coding.head_ascii = src - coding.source; if (null_byte_found) break; } - if (c < 0x20) + else if (c < 0x20) { if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) && ! inhibit_iso_escape_detection && ! detect_info.checked) { - if (coding.head_ascii < 0) - coding.head_ascii = src - coding.source; if (detect_coding_iso_2022 (&coding, &detect_info)) { /* We have scanned the whole data. */ if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) - /* We didn't find an 8-bit code. We may have - found a null-byte, but it's very rare that - a binary file confirm to ISO-2022. */ - src = src_end; + { + /* We didn't find an 8-bit code. We may + have found a null-byte, but it's very + rare that a binary file conform to + ISO-2022. 
*/ + src = src_end; + coding.head_ascii = src - coding.source; + } + detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE; break; } } @@ -7602,10 +7766,12 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, if (eight_bit_found) break; } + if (! eight_bit_found) + coding.head_ascii++; } + else if (! eight_bit_found) + coding.head_ascii++; } - if (coding.head_ascii < 0) - coding.head_ascii = src - coding.source; if (null_byte_found || eight_bit_found || coding.head_ascii < coding.src_bytes @@ -7695,7 +7861,6 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, { int mask = detect_info.rejected | detect_info.found; int found = 0; - val = Qnil; for (i = coding_category_raw_text - 1; i >= 0; i--) { @@ -7720,6 +7885,19 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, detect_info.found |= found; } } + else if (base_category == coding_category_utf_8_auto) + { + if (detect_coding_utf_8 (&coding, &detect_info)) + { + struct coding_system *this; + + if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) + this = coding_categories + coding_category_utf_8_sig; + else + this = coding_categories + coding_category_utf_8_nosig; + val = Fcons (make_number (this->id), Qnil); + } + } else if (base_category == coding_category_utf_16_auto) { if (detect_coding_utf_16 (&coding, &detect_info)) @@ -7745,7 +7923,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, /* Then, detect eol-format if necessary. */ { - int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol; + int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1; Lisp_Object tail; if (VECTORP (eol_type)) @@ -7811,7 +7989,7 @@ detect_coding_system (src, src_chars, src_bytes, highest, multibytep, } } - return (highest ? XCAR (val) : val); + return (highest ? (CONSP (val) ? 
XCAR (val) : Qnil) : val); } @@ -7821,9 +7999,9 @@ DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, Return a list of possible coding systems ordered by priority. If only ASCII characters are found (except for such ISO-2022 control -characters ISO-2022 as ESC), it returns a list of single element -`undecided' or its subsidiary coding system according to a detected -end-of-line format. +characters as ESC), it returns a list of single element `undecided' +or its subsidiary coding system according to a detected end-of-line +format. If optional argument HIGHEST is non-nil, return the coding system of highest priority. */) @@ -7858,9 +8036,9 @@ DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, Return a list of possible coding systems ordered by priority. If only ASCII characters are found (except for such ISO-2022 control -characters ISO-2022 as ESC), it returns a list of single element -`undecided' or its subsidiary coding system according to a detected -end-of-line format. +characters as ESC), it returns a list of single element `undecided' +or its subsidiary coding system according to a detected end-of-line +format. If optional argument HIGHEST is non-nil, return the coding system of highest priority. */) @@ -8028,7 +8206,7 @@ DEFUN ("unencodable-char-position", Funencodable_char_position, Sunencodable_char_position, 3, 5, 0, doc: /* Return position of first un-encodable character in a region. -START and END specfiy the region and CODING-SYSTEM specifies the +START and END specify the region and CODING-SYSTEM specifies the encoding to check. Return nil if CODING-SYSTEM does encode the region. If optional 4th argument COUNT is non-nil, it specifies at most how @@ -8141,7 +8319,7 @@ START and END are buffer positions specifying the region. CODING-SYSTEM-LIST is a list of coding systems to check. The value is an alist ((CODING-SYSTEM POS0 POS1 ...) 
...), where -CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the +CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the whole region, POS0, POS1, ... are buffer positions where non-encodable characters are found. @@ -8310,13 +8488,14 @@ START and END are buffer positions. Optional 4th arguments DESTINATION specifies where the decoded text goes. If nil, the region between START and END is replaced by the decoded text. -If buffer, the decoded text is inserted in the buffer. -If t, the decoded text is returned. +If buffer, the decoded text is inserted in that buffer after point (point +does not move). +In those cases, the length of the decoded text is returned. +If DESTINATION is t, the decoded text is returned. This function sets `last-coding-system-used' to the precise coding system used (which may be different from CODING-SYSTEM if CODING-SYSTEM is -not fully specified.) -It returns the length of the decoded text. */) +not fully specified.) */) (start, end, coding_system, destination) Lisp_Object start, end, coding_system, destination; { @@ -8326,18 +8505,20 @@ It returns the length of the decoded text. */) DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region, 3, 4, "r\nzCoding system: ", doc: /* Encode the current region by specified coding system. -When called from a program, takes three arguments: -START, END, and CODING-SYSTEM. START and END are buffer positions. +When called from a program, takes four arguments: + START, END, CODING-SYSTEM and DESTINATION. +START and END are buffer positions. Optional 4th arguments DESTINATION specifies where the encoded text goes. If nil, the region between START and END is replace by the encoded text. -If buffer, the encoded text is inserted in the buffer. -If t, the encoded text is returned. +If buffer, the encoded text is inserted in that buffer after point (point +does not move). +In those cases, the length of the encoded text is returned. 
+If DESTINATION is t, the encoded text is returned. This function sets `last-coding-system-used' to the precise coding system used (which may be different from CODING-SYSTEM if CODING-SYSTEM is -not fully specified.) -It returns the length of the encoded text. */) +not fully specified.) */) (start, end, coding_system, destination) Lisp_Object start, end, coding_system, destination; { @@ -8410,13 +8591,13 @@ DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string, Optional third arg NOCOPY non-nil means it is OK to return STRING itself if the decoding operation is trivial. -Optional fourth arg BUFFER non-nil meant that the decoded text is -inserted in BUFFER instead of returned as a string. In this case, -the return value is BUFFER. +Optional fourth arg BUFFER non-nil means that the decoded text is +inserted in that buffer after point (point does not move). In this +case, the return value is the length of the decoded text. This function sets `last-coding-system-used' to the precise coding system used (which may be different from CODING-SYSTEM if CODING-SYSTEM is -not fully specified. */) +not fully specified.) */) (string, coding_system, nocopy, buffer) Lisp_Object string, coding_system, nocopy, buffer; { @@ -8431,9 +8612,9 @@ DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string, Optional third arg NOCOPY non-nil means it is OK to return STRING itself if the encoding operation is trivial. -Optional fourth arg BUFFER non-nil meant that the encoded text is -inserted in BUFFER instead of returned as a string. In this case, -the return value is BUFFER. +Optional fourth arg BUFFER non-nil means that the encoded text is +inserted in that buffer after point (point does not move). In this +case, the return value is the length of the encoded text. 
This function sets `last-coding-system-used' to the precise coding system used (which may be different from CODING-SYSTEM if CODING-SYSTEM is @@ -8691,9 +8872,9 @@ whichever argument specifies the file name is TARGET. TARGET has a meaning which depends on OPERATION: For file I/O, TARGET is a file name (except for the special case below). For process I/O, TARGET is a process name. - For network I/O, TARGET is a service name or a port number + For network I/O, TARGET is a service name or a port number. -This function looks up what specified for TARGET in, +This function looks up what is specified for TARGET in `file-coding-system-alist', `process-coding-system-alist', or `network-coding-system-alist' depending on OPERATION. They may specify a coding system, a cons of coding systems, @@ -8785,10 +8966,10 @@ usage: (find-operation-coding-system OPERATION ARGUMENTS...) */) DEFUN ("set-coding-system-priority", Fset_coding_system_priority, Sset_coding_system_priority, 0, MANY, 0, doc: /* Assign higher priority to the coding systems given as arguments. -If multiple coding systems belongs to the same category, +If multiple coding systems belong to the same category, all but the first one are ignored. -usage: (set-coding-system-priority ...) */) +usage: (set-coding-system-priority &rest coding-systems) */) (nargs, args) int nargs; Lisp_Object *args; @@ -9149,7 +9330,7 @@ usage: (define-coding-system-internal ...) */) val = XCDR (bom); CHECK_CODING_SYSTEM (val); } - ASET (attrs, coding_attr_utf_16_bom, bom); + ASET (attrs, coding_attr_utf_bom, bom); endian = args[coding_arg_utf16_endian]; CHECK_SYMBOL (endian); @@ -9328,8 +9509,27 @@ usage: (define-coding-system-internal ...) */) } else if (EQ (coding_type, Qutf_8)) { - category = coding_category_utf_8; + Lisp_Object bom; + CODING_ATTR_ASCII_COMPAT (attrs) = Qt; + + if (nargs < coding_arg_utf8_max) + goto short_args; + + bom = args[coding_arg_utf8_bom]; + if (! NILP (bom) && ! 
EQ (bom, Qt)) + { + CHECK_CONS (bom); + val = XCAR (bom); + CHECK_CODING_SYSTEM (val); + val = XCDR (bom); + CHECK_CODING_SYSTEM (val); + } + ASET (attrs, coding_attr_utf_bom, bom); + + category = (CONSP (bom) ? coding_category_utf_8_auto + : NILP (bom) ? coding_category_utf_8_nosig + : coding_category_utf_8_sig); } else if (EQ (coding_type, Qundecided)) category = coding_category_undecided; @@ -9421,7 +9621,7 @@ DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put, CHECK_CHARACTER (val); CODING_ATTR_MNEMONIC (attrs) = val; } - else if (EQ (prop, QCdefalut_char)) + else if (EQ (prop, QCdefault_char)) { if (NILP (val)) val = make_number (' '); @@ -9473,7 +9673,7 @@ DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, CHECK_SYMBOL (alias); CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); aliases = AREF (spec, 1); - /* ALISES should be a list of length more than zero, and the first + /* ALIASES should be a list of length more than zero, and the first element is a base coding system. Append ALIAS at the tail of the list. */ while (!NILP (XCDR (aliases))) @@ -9551,7 +9751,7 @@ DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases, DEFUN ("coding-system-eol-type", Fcoding_system_eol_type, Scoding_system_eol_type, 1, 1, 0, doc: /* Return eol-type of CODING-SYSTEM. -An eol-type is integer 0, 1, 2, or a vector of coding systems. +An eol-type is an integer 0, 1, 2, or a vector of coding systems. Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF, and CR respectively. 
@@ -9727,7 +9927,7 @@ syms_of_coding () DEFSYM (QCcategory, ":category"); DEFSYM (QCmnemonic, ":mnemonic"); - DEFSYM (QCdefalut_char, ":default-char"); + DEFSYM (QCdefault_char, ":default-char"); DEFSYM (QCdecode_translation_table, ":decode-translation-table"); DEFSYM (QCencode_translation_table, ":encode-translation-table"); DEFSYM (QCpost_read_conversion, ":post-read-conversion"); @@ -9750,8 +9950,12 @@ syms_of_coding () intern ("coding-category-iso-7-else")); ASET (Vcoding_category_table, coding_category_iso_8_else, intern ("coding-category-iso-8-else")); - ASET (Vcoding_category_table, coding_category_utf_8, + ASET (Vcoding_category_table, coding_category_utf_8_auto, + intern ("coding-category-utf-8-auto")); + ASET (Vcoding_category_table, coding_category_utf_8_nosig, intern ("coding-category-utf-8")); + ASET (Vcoding_category_table, coding_category_utf_8_sig, + intern ("coding-category-utf-8-sig")); ASET (Vcoding_category_table, coding_category_utf_16_be, intern ("coding-category-utf-16-be")); ASET (Vcoding_category_table, coding_category_utf_16_auto, @@ -9829,7 +10033,7 @@ updated by the functions `define-coding-system' and DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist, doc: /* Alist of coding system names. Each element is one element list of coding system name. -This variable is given to `completing-read' as TABLE argument. +This variable is given to `completing-read' as COLLECTION argument. Do not alter the value of this variable manually. This variable should be updated by the functions `make-coding-system' and @@ -9859,8 +10063,8 @@ Don't modify this variable directly, but use `set-coding-priority'. */); doc: /* Specify the coding system for read operations. It is useful to bind this variable with `let', but do not set it globally. If the value is a coding system, it is used for decoding on read operation. 
-If not, an appropriate element is used from one of the coding system alists: -There are three such tables, `file-coding-system-alist', +If not, an appropriate element is used from one of the coding system alists. +There are three such tables: `file-coding-system-alist', `process-coding-system-alist', and `network-coding-system-alist'. */); Vcoding_system_for_read = Qnil; @@ -9871,8 +10075,8 @@ If the value is a coding system, it is used for encoding of output, when writing it to a file and when sending it to a file or subprocess. If this does not specify a coding system, an appropriate element -is used from one of the coding system alists: -There are three such tables, `file-coding-system-alist', +is used from one of the coding system alists. +There are three such tables: `file-coding-system-alist', `process-coding-system-alist', and `network-coding-system-alist'. For output to files, if the above procedure does not specify a coding system, the value of `buffer-file-coding-system' is used. */); @@ -10032,7 +10236,7 @@ If Nth element is non-nil, the existence of code N in a file a coding system of ISO 2022 variant which has a flag `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file or reading output of a subprocess. -Only 128th through 159th elements has a meaning. */); +Only 128th through 159th elements have a meaning. */); Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil); DEFVAR_LISP ("select-safe-coding-system-function", @@ -10088,8 +10292,8 @@ escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argumen DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input, doc: /* Char table for translating self-inserting characters. -This is applied to the result of input methods, not their input. See also -`keyboard-translate-table'. */); +This is applied to the result of input methods, not their input. +See also `keyboard-translate-table'. */); Vtranslation_table_for_input = Qnil; {