Fix the setting of buffer-file-coding-system on, for instance, C-x RET c unix RET...

author Kenichi Handa <handa@gnu.org>

Wed, 22 May 2013 14:53:21 +0000 (23:53 +0900)

committer Kenichi Handa <handa@gnu.org>

Wed, 22 May 2013 14:53:21 +0000 (23:53 +0900)
author Kenichi Handa <handa@gnu.org>
Wed, 22 May 2013 14:53:21 +0000 (23:53 +0900)
committer Kenichi Handa <handa@gnu.org>
Wed, 22 May 2013 14:53:21 +0000 (23:53 +0900)
diff --git a/src/ChangeLog b/src/ChangeLog

index b05ee5efd9de7bd85a66c8ac6a2620efe5674f1e..cc6194ac4677af58064a0c7c09a4f8eb42e76e19 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,24 @@
+2013-05-22  Kenichi Handa  <handa@gnu.org>
+
+       The following changes are to fix the setting of
+       buffer-file-coding-system on, for instance, C-x RET c unix RET
+       _FILE_OF_DOS_EOL_TYPE_ RET.
+
+       * coding.h (struct coding_system): New member detected_utf8_chars.
+
+       * coding.c (detect_coding_utf_8): Count characters and check EOL
+       format.  Include CATEGORY_MASK_UTF_8_AUTO in detect_info->found if
+       BOM is there.
+       (setup_coding_system): Do not initialize coding->head_ascii.
+       (check_ascii): Do not set coding->eol_seen but update it.  Do not
+       call adjust_coding_eol_type here.
+       (detect_coding): Fix detection of BOM for utf-8 and utf-16.  If
+       the eol-type of CODING is already specified, adjust the eol type
+       of the found coding-system.
+       (decode_coding_gap): Cancel previous change.  Utilize the
+       character numbers counted by detect_coding_utf_8.  Fix detection
+       of BOM for utf-8.
+
  2013-05-08  Paul Eggert  <eggert@cs.ucla.edu>
  
         * xterm.h (GTK_PREREQ): New macro.
diff --git a/src/coding.c b/src/coding.c

index f6664e179b7271295e2a24171c565c8d2d2e67b2..42fd81b6322507001fcc2cbfda10d10067897bb1 100644 (file)
--- a/src/coding.c
+++ b/src/coding.c
@@ -1125,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
      *buf++ = id;                                                       \
    } while (0)
  
+
+/* Bitmasks for coding->eol_seen.  */
+
+#define EOL_SEEN_NONE  0
+#define EOL_SEEN_LF    1
+#define EOL_SEEN_CR    2
+#define EOL_SEEN_CRLF  4
+
  \f
  /*** 2. Emacs' internal format (emacs-utf-8) ***/
  
@@ -1147,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
  #define UTF_8_BOM_2 0xBB
  #define UTF_8_BOM_3 0xBF
  
+/* Unlike the other detect_coding_XXX functions, this function counts
+   the number of characters and checks the EOL format.  */
+
  static bool
  detect_coding_utf_8 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
@@ -1156,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding,
    bool multibytep = coding->src_multibyte;
    ptrdiff_t consumed_chars = 0;
    bool bom_found = 0;
-  bool found = 0;
+  int nchars = coding->head_ascii;
+  int eol_seen = coding->eol_seen;
  
    detect_info->checked |= CATEGORY_MASK_UTF_8;
    /* A coding system of this category is always ASCII compatible.  */
-  src += coding->head_ascii;
+  src += nchars;
+
+  if (src == coding->source    /* BOM should be at the head.  */
+      && src + 3 < src_end     /* BOM is 3-byte long.  */
+      && src[0] == UTF_8_BOM_1
+      && src[1] == UTF_8_BOM_2
+      && src[2] == UTF_8_BOM_3)
+    {
+      bom_found = 1;
+      src += 3;
+      nchars++;
+    }
  
    while (1)
      {
@@ -1169,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding,
        src_base = src;
        ONE_MORE_BYTE (c);
        if (c < 0 || UTF_8_1_OCTET_P (c))
-       continue;
+       {
+         nchars++;
+         if (c == '\r')
+           {
+             if (src < src_end && *src == '\n')
+               {
+                 eol_seen |= EOL_SEEN_CRLF;
+                 src++;
+                 nchars++;
+               }
+             else
+               eol_seen |= EOL_SEEN_CR;
+           }
+         else if (c == '\n')
+           eol_seen |= EOL_SEEN_LF;
+         continue;
+       }
        ONE_MORE_BYTE (c1);
        if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
         break;
        if (UTF_8_2_OCTET_LEADING_P (c))
         {
-         found = 1;
+         nchars++;
           continue;
         }
        ONE_MORE_BYTE (c2);
@@ -1183,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding,
         break;
        if (UTF_8_3_OCTET_LEADING_P (c))
         {
-         found = 1;
-         if (src_base == coding->source
-             && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
-           bom_found = 1;
+         nchars++;
           continue;
         }
        ONE_MORE_BYTE (c3);
@@ -1194,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding,
         break;
        if (UTF_8_4_OCTET_LEADING_P (c))
         {
-         found = 1;
+         nchars++;
           continue;
         }
        ONE_MORE_BYTE (c4);
@@ -1202,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding,
         break;
        if (UTF_8_5_OCTET_LEADING_P (c))
         {
-         found = 1;
+         nchars++;
           continue;
         }
        break;
@@ -1219,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding,
    if (bom_found)
      {
        /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
-      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
+      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
      }
    else
      {
        detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
-      if (found)
-       detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
+      if (nchars < src_end - coding->source)
+       /* Fewer characters were found than source bytes, which
+          means that we found valid non-ASCII characters.  */
+       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
      }
+  coding->detected_utf8_chars = nchars;
    return 1;
  }
  
@@ -5622,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
    eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  
    coding->mode = 0;
-  coding->head_ascii = -1;
    if (VECTORP (eol_type))
      coding->common_flags = (CODING_REQUIRE_DECODING_MASK
                             | CODING_REQUIRE_DETECTION_MASK);
@@ -6074,46 +6112,35 @@ complement_process_encoding_system (Lisp_Object coding_system)
  
  */
  
-#define EOL_SEEN_NONE  0
-#define EOL_SEEN_LF    1
-#define EOL_SEEN_CR    2
-#define EOL_SEEN_CRLF  4
-
-
  static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
                                            int eol_seen);
  
  
  /* Return the number of ASCII characters at the head of the source.
-   By side effects, set coding->head_ascii and coding->eol_seen.  The
-   value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
-   EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
-   all the source bytes are ASCII.  */
+   By side effects, set coding->head_ascii and update
+   coding->eol_seen.  The value of coding->eol_seen is "logical or" of
+   EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
+   reliable only when all the source bytes are ASCII.  */
  
  static int
  check_ascii (struct coding_system *coding)
  {
    const unsigned char *src, *end;
    Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
-  int eol_seen;
+  int eol_seen = coding->eol_seen;
  
-  eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
-             : EQ (eol_type, Qunix) ? EOL_SEEN_LF
-             : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
-             : EOL_SEEN_CR);
    coding_set_source (coding);
    src = coding->source;
    end = src + coding->src_bytes;
  
    if (inhibit_eol_conversion
-      || eol_seen != EOL_SEEN_NONE)
+      || SYMBOLP (eol_type))
      {
        /* We don't have to check EOL format.  */
-      while (src < end && !( *src & 0x80)) src++;
-      if (inhibit_eol_conversion)
+      while (src < end && !( *src & 0x80))
         {
-         eol_seen = EOL_SEEN_LF;
-         adjust_coding_eol_type (coding, eol_seen);
+         if (*src++ == '\n')
+           eol_seen |= EOL_SEEN_LF;
         }
      }
    else
@@ -6171,7 +6198,7 @@ static int
  check_utf_8 (struct coding_system *coding)
  {
    const unsigned char *src, *end;
-  int eol_seen = coding->eol_seen;
+  int eol_seen;
    int nchars = coding->head_ascii;
  
    if (coding->head_ascii < 0)
@@ -6181,7 +6208,7 @@ check_utf_8 (struct coding_system *coding)
    src = coding->source + coding->head_ascii;
    /* We look ahead one byte for CR LF.  */
    end = coding->source + coding->src_bytes - 1;
-
+  eol_seen = coding->eol_seen;
    while (src < end)
      {
        int c = *src;
@@ -6402,6 +6429,8 @@ detect_coding (struct coding_system *coding)
  {
    const unsigned char *src, *src_end;
    unsigned int saved_mode = coding->mode;
+  Lisp_Object found = Qnil;
+  Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
  
    coding->consumed = coding->consumed_char = 0;
    coding->produced = coding->produced_char = 0;
@@ -6409,6 +6438,7 @@ detect_coding (struct coding_system *coding)
  
    src_end = coding->source + coding->src_bytes;
  
+  coding->eol_seen = EOL_SEEN_NONE;
    /* If we have not yet decided the text encoding type, detect it
       now.  */
    if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
@@ -6418,7 +6448,6 @@ detect_coding (struct coding_system *coding)
        bool null_byte_found = 0, eight_bit_found = 0;
  
        coding->head_ascii = 0;
-      coding->eol_seen = EOL_SEEN_NONE;
        detect_info.checked = detect_info.found = detect_info.rejected = 0;
        for (src = coding->source; src < src_end; src++)
         {
@@ -6529,32 +6558,58 @@ detect_coding (struct coding_system *coding)
                     }
                   else if ((*(this->detector)) (coding, &detect_info)
                            && detect_info.found & (1 << category))
-                   {
-                     if (category == coding_category_utf_16_auto)
-                       {
-                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
-                           category = coding_category_utf_16_le;
-                         else
-                           category = coding_category_utf_16_be;
-                       }
-                     break;
-                   }
+                   break;
                 }
             }
  
           if (i < coding_category_raw_text)
-           setup_coding_system (CODING_ID_NAME (this->id), coding);
+           {
+             if (category == coding_category_utf_8_auto)
+               {
+                 Lisp_Object coding_systems;
+
+                 coding_systems = AREF (CODING_ID_ATTRS (this->id),
+                                        coding_attr_utf_bom);
+                 if (CONSP (coding_systems))
+                   {
+                     if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+                       found = XCAR (coding_systems);
+                     else
+                       found = XCDR (coding_systems);
+                   }
+                 else
+                   found = CODING_ID_NAME (this->id);
+               }
+             else if (category == coding_category_utf_16_auto)
+               {
+                 Lisp_Object coding_systems;
+
+                 coding_systems = AREF (CODING_ID_ATTRS (this->id),
+                                        coding_attr_utf_bom);
+                 if (CONSP (coding_systems))
+                   {
+                     if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
+                       found = XCAR (coding_systems);
+                     else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
+                       found = XCDR (coding_systems);
+                   }
+                 else
+                   found = CODING_ID_NAME (this->id);
+               }
+             else
+               found = CODING_ID_NAME (this->id);
+           }
           else if (null_byte_found)
-           setup_coding_system (Qno_conversion, coding);
+           found = Qno_conversion;
           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                    == CATEGORY_MASK_ANY)
-           setup_coding_system (Qraw_text, coding);
+           found = Qraw_text;
           else if (detect_info.rejected)
             for (i = 0; i < coding_category_raw_text; i++)
               if (! (detect_info.rejected & (1 << coding_priorities[i])))
                 {
                   this = coding_categories + coding_priorities[i];
-                 setup_coding_system (CODING_ID_NAME (this->id), coding);
+                 found = CODING_ID_NAME (this->id);
                   break;
                 }
         }
@@ -6570,12 +6625,8 @@ detect_coding (struct coding_system *coding)
        detect_info.found = detect_info.rejected = 0;
        if (check_ascii (coding) == coding->src_bytes)
         {
-         int head_ascii = coding->head_ascii;
-
-         if (coding->eol_seen != EOL_SEEN_NONE)
-           adjust_coding_eol_type (coding, coding->eol_seen);
-         setup_coding_system (XCDR (coding_systems), coding);
-         coding->head_ascii = head_ascii;
+         if (CONSP (coding_systems))
+           found = XCDR (coding_systems);
         }
        else
         {
@@ -6583,9 +6634,9 @@ detect_coding (struct coding_system *coding)
               && detect_coding_utf_8 (coding, &detect_info))
             {
               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
-               setup_coding_system (XCAR (coding_systems), coding);
+               found = XCAR (coding_systems);
               else
-               setup_coding_system (XCDR (coding_systems), coding);
+               found = XCDR (coding_systems);
             }
         }
      }
@@ -6599,16 +6650,28 @@ detect_coding (struct coding_system *coding)
         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
        detect_info.found = detect_info.rejected = 0;
        coding->head_ascii = 0;
-      coding->eol_seen = EOL_SEEN_NONE;
        if (CONSP (coding_systems)
           && detect_coding_utf_16 (coding, &detect_info))
         {
           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
-           setup_coding_system (XCAR (coding_systems), coding);
+           found = XCAR (coding_systems);
           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
-           setup_coding_system (XCDR (coding_systems), coding);
+           found = XCDR (coding_systems);
         }
      }
+
+  if (! NILP (found))
+    {
+      int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
+                          : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
+                          : EQ (eol_type, Qmac) ? EOL_SEEN_CR
+                          : EOL_SEEN_LF);
+
+      setup_coding_system (found, coding);
+      if (specified_eol != EOL_SEEN_NONE)
+       adjust_coding_eol_type (coding, specified_eol);
+    }
+
    coding->mode = saved_mode;
  }
  
@@ -7729,6 +7792,9 @@ decode_coding_gap (struct coding_system *coding,
    coding->dst_pos_byte = PT_BYTE;
    coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
  
+  coding->head_ascii = -1;
+  coding->detected_utf8_chars = -1;
+  coding->eol_seen = EOL_SEEN_NONE;
    if (CODING_REQUIRE_DETECTION (coding))
      detect_coding (coding);
    attrs = CODING_ID_ATTRS (coding->id);
@@ -7743,17 +7809,38 @@ decode_coding_gap (struct coding_system *coding,
         chars = check_ascii (coding);
        if (chars != bytes)
         {
+         /* There exists a non-ASCII byte.  */
           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
-           chars = check_utf_8 (coding);
+           {
+             if (coding->detected_utf8_chars >= 0)
+               chars = coding->detected_utf8_chars;
+             else
+               chars = check_utf_8 (coding);
+             if (CODING_UTF_8_BOM (coding) != utf_without_bom
+                 && coding->head_ascii == 0
+                 && coding->source[0] == UTF_8_BOM_1
+                 && coding->source[1] == UTF_8_BOM_2
+                 && coding->source[2] == UTF_8_BOM_3)
+               {
+                 chars--;
+                 bytes -= 3;
+                 coding->src_bytes -= 3;
+               }
+           }
           else
             chars = -1;
         }
        if (chars >= 0)
         {
-         if (coding->eol_seen != EOL_SEEN_NONE)
-           adjust_coding_eol_type (coding, coding->eol_seen);
+         Lisp_Object eol_type;
  
-         if (coding->eol_seen == EOL_SEEN_CR)
+         eol_type = CODING_ID_EOL_TYPE (coding->id);
+         if (VECTORP (eol_type))
+           {
+             if (coding->eol_seen != EOL_SEEN_NONE)
+               eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
+           }
+         if (EQ (eol_type, Qmac))
             {
               unsigned char *src_end = GAP_END_ADDR;
               unsigned char *src = src_end - coding->src_bytes;
@@ -7764,7 +7851,7 @@ decode_coding_gap (struct coding_system *coding,
                     src[-1] = '\n';
                 }
             }
-         else if (coding->eol_seen == EOL_SEEN_CRLF)
+         else if (EQ (eol_type, Qdos))
             {
               unsigned char *src = GAP_END_ADDR;
               unsigned char *src_beg = src - coding->src_bytes;
diff --git a/src/coding.h b/src/coding.h

index d40209be68f03facbb54fff697b322173fa2b245..d13fd42fe4ff392daf689b0aebe414ceb0985a89 100644 (file)
--- a/src/coding.h
+++ b/src/coding.h
@@ -444,6 +444,8 @@ struct coding_system
       the eol format.  */
    ptrdiff_t head_ascii;
  
+  ptrdiff_t detected_utf8_chars;
+
    /* Used internally in coding.c.  See the comment of detect_ascii.  */
    int eol_seen;
author	Kenichi Handa <handa@gnu.org>
	Wed, 22 May 2013 14:53:21 +0000 (23:53 +0900)
committer	Kenichi Handa <handa@gnu.org>
	Wed, 22 May 2013 14:53:21 +0000 (23:53 +0900)
src/ChangeLog		patch \| blob \| history
src/coding.c		patch \| blob \| history
src/coding.h		patch \| blob \| history