]> code.delx.au - gnu-emacs/blobdiff - src/coding.c
Sync x-server-* and x-display-* functions on NS with those on X11.
[gnu-emacs] / src / coding.c
index c18632f301bab40258923a68e7c8c5df42e07b4f..f6664e179b7271295e2a24171c565c8d2d2e67b2 100644 (file)
@@ -3887,6 +3887,14 @@ decode_coding_iso_2022 (struct coding_system *coding)
       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
       char_offset++;
       coding->errors++;
+      /* Reset the invocation and designation status to the safest
+        one; i.e. designate ASCII to the graphic register 0, and
+        invoke that register to the graphic plane 0.  This typically
+        helps the case that an designation sequence for ASCII "ESC (
+        B" is somehow broken (e.g. broken by a newline).  */
+      CODING_ISO_INVOCATION (coding, 0) = 0;
+      CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
+      charset_id_0 = charset_ascii;
       continue;
 
     break_loop:
@@ -6071,6 +6079,188 @@ complement_process_encoding_system (Lisp_Object coding_system)
 #define EOL_SEEN_CR    2
 #define EOL_SEEN_CRLF  4
 
+
+static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
+                                          int eol_seen);
+
+
+/* Return the number of ASCII characters at the head of the source.
+   By side effects, set coding->head_ascii and coding->eol_seen.  The
+   value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
+   EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
+   all the source bytes are ASCII.  */
+
+static int
+check_ascii (struct coding_system *coding)
+{
+  const unsigned char *src, *end;
+  Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
+  int eol_seen;
+
+  eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
+             : EQ (eol_type, Qunix) ? EOL_SEEN_LF
+             : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
+             : EOL_SEEN_CR);
+  coding_set_source (coding);
+  src = coding->source;
+  end = src + coding->src_bytes;
+
+  if (inhibit_eol_conversion
+      || eol_seen != EOL_SEEN_NONE)
+    {
+      /* We don't have to check EOL format.  */
+      while (src < end && !( *src & 0x80)) src++;
+      if (inhibit_eol_conversion)
+       {
+         eol_seen = EOL_SEEN_LF;
+         adjust_coding_eol_type (coding, eol_seen);
+       }
+    }
+  else
+    {
+      end--;               /* We look ahead one byte for "CR LF".  */
+      while (src < end)
+       {
+         int c = *src;
+
+         if (c & 0x80)
+           break;
+         src++;
+         if (c == '\r')
+           {
+             if (*src == '\n')
+               {
+                 eol_seen |= EOL_SEEN_CRLF;
+                 src++;
+               }
+             else
+               eol_seen |= EOL_SEEN_CR;
+           }
+         else if (c == '\n')
+           eol_seen |= EOL_SEEN_LF;
+       }
+      if (src == end)
+       {
+         int c = *src;
+
+         /* All bytes but the last one C are ASCII.  */
+         if (! (c & 0x80))
+           {
+             if (c == '\r')
+               eol_seen |= EOL_SEEN_CR;
+             else if (c  == '\n')
+               eol_seen |= EOL_SEEN_LF;
+             src++;
+           }
+       }
+    }
+  coding->head_ascii = src - coding->source;
+  coding->eol_seen = eol_seen;
+  return (coding->head_ascii);
+}
+
+
+/* Return the number of characters at the source if all the bytes are
+   valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
+   effects, update coding->eol_seen.  The value of coding->eol_seen is
+   "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
+   the value is reliable only when all the source bytes are valid
+   UTF-8.  */
+
+static int
+check_utf_8 (struct coding_system *coding)
+{
+  const unsigned char *src, *end;
+  int eol_seen = coding->eol_seen;
+  int nchars = coding->head_ascii;
+
+  if (coding->head_ascii < 0)
+    check_ascii (coding);
+  else
+    coding_set_source (coding);
+  src = coding->source + coding->head_ascii;
+  /* We look ahead one byte for CR LF.  */
+  end = coding->source + coding->src_bytes - 1;
+
+  while (src < end)
+    {
+      int c = *src;
+
+      if (UTF_8_1_OCTET_P (*src))
+       {
+         src++;
+         if (c < 0x20)
+           {
+             if (c == '\r')
+               {
+                 if (*src == '\n')
+                   {
+                     eol_seen |= EOL_SEEN_CRLF;
+                     src++;
+                     nchars++;
+                   }
+                 else
+                   eol_seen |= EOL_SEEN_CR;
+               }
+             else if (c == '\n')
+               eol_seen |= EOL_SEEN_LF;
+           }
+       }
+      else if (UTF_8_2_OCTET_LEADING_P (c))
+       {
+         if (c < 0xC2          /* overlong sequence */
+             || src + 1 >= end
+             || ! UTF_8_EXTRA_OCTET_P (src[1]))
+           return -1;
+         src += 2;
+       }
+      else if (UTF_8_3_OCTET_LEADING_P (c))
+       {
+         if (src + 2 >= end
+             || ! (UTF_8_EXTRA_OCTET_P (src[1])
+                   && UTF_8_EXTRA_OCTET_P (src[2])))
+           return -1;
+         c = (((c & 0xF) << 12)
+              | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
+         if (c < 0x800                       /* overlong sequence */
+             || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
+           return -1;
+         src += 3;
+       }
+      else if (UTF_8_4_OCTET_LEADING_P (c))
+       {
+         if (src + 3 >= end
+             || ! (UTF_8_EXTRA_OCTET_P (src[1])
+                   && UTF_8_EXTRA_OCTET_P (src[2])
+                   && UTF_8_EXTRA_OCTET_P (src[3])))
+           return -1;
+         c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
+              | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+         if (c < 0x10000       /* overlong sequence */
+             || c >= 0x110000) /* non-Unicode character  */
+           return -1;
+         src += 4;
+       }
+      else
+       return -1;
+      nchars++;
+    }
+
+  if (src == end)
+    {
+      if (! UTF_8_1_OCTET_P (*src))
+       return -1;
+      nchars++;
+      if (*src == '\r')
+       eol_seen |= EOL_SEEN_CR;
+      else if (*src  == '\n')
+       eol_seen |= EOL_SEEN_LF;
+    }
+  coding->eol_seen = eol_seen;
+  return nchars;
+}
+
+
 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
    SOURCE is encoded.  If CATEGORY is one of
    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
@@ -6182,6 +6372,9 @@ adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
   Lisp_Object eol_type;
 
   eol_type = CODING_ID_EOL_TYPE (coding->id);
+  if (! VECTORP (eol_type))
+    /* Already adjusted.  */
+    return eol_type;
   if (eol_seen & EOL_SEEN_LF)
     {
       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
@@ -6215,7 +6408,6 @@ detect_coding (struct coding_system *coding)
   coding_set_source (coding);
 
   src_end = coding->source + coding->src_bytes;
-  coding->head_ascii = 0;
 
   /* If we have not yet decided the text encoding type, detect it
      now.  */
@@ -6225,6 +6417,8 @@ detect_coding (struct coding_system *coding)
       struct coding_detection_info detect_info;
       bool null_byte_found = 0, eight_bit_found = 0;
 
+      coding->head_ascii = 0;
+      coding->eol_seen = EOL_SEEN_NONE;
       detect_info.checked = detect_info.found = detect_info.rejected = 0;
       for (src = coding->source; src < src_end; src++)
        {
@@ -6263,6 +6457,27 @@ detect_coding (struct coding_system *coding)
                  if (eight_bit_found)
                    break;
                }
+             else if (! disable_ascii_optimization
+                      && ! inhibit_eol_conversion)
+               {
+                 if (c == '\r')
+                   {
+                     if (src < src_end && src[1] == '\n')
+                       {
+                         coding->eol_seen |= EOL_SEEN_CRLF;
+                         src++;
+                         if (! eight_bit_found)
+                           coding->head_ascii++;
+                       }
+                     else
+                       coding->eol_seen |= EOL_SEEN_CR;
+                   }
+                 else if (c == '\n')
+                   {
+                     coding->eol_seen |= EOL_SEEN_LF;
+                   }
+               }
+
              if (! eight_bit_found)
                coding->head_ascii++;
            }
@@ -6353,19 +6568,25 @@ detect_coding (struct coding_system *coding)
       coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
       detect_info.found = detect_info.rejected = 0;
-      for (src = coding->source; src < src_end; src++)
+      if (check_ascii (coding) == coding->src_bytes)
        {
-         if (*src & 0x80)
-           break;
+         int head_ascii = coding->head_ascii;
+
+         if (coding->eol_seen != EOL_SEEN_NONE)
+           adjust_coding_eol_type (coding, coding->eol_seen);
+         setup_coding_system (XCDR (coding_systems), coding);
+         coding->head_ascii = head_ascii;
        }
-      coding->head_ascii = src - coding->source;
-      if (CONSP (coding_systems)
-         && detect_coding_utf_8 (coding, &detect_info))
+      else
        {
-         if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
-           setup_coding_system (XCAR (coding_systems), coding);
-         else
-           setup_coding_system (XCDR (coding_systems), coding);
+         if (CONSP (coding_systems)
+             && detect_coding_utf_8 (coding, &detect_info))
+           {
+             if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+               setup_coding_system (XCAR (coding_systems), coding);
+             else
+               setup_coding_system (XCDR (coding_systems), coding);
+           }
        }
     }
   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -6378,6 +6599,7 @@ detect_coding (struct coding_system *coding)
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
       detect_info.found = detect_info.rejected = 0;
       coding->head_ascii = 0;
+      coding->eol_seen = EOL_SEEN_NONE;
       if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
@@ -6815,7 +7037,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
 
   produced = dst - (coding->destination + coding->produced);
   if (BUFFERP (coding->dst_object) && produced_chars > 0)
-    insert_from_gap (produced_chars, produced);
+    insert_from_gap (produced_chars, produced, 0);
   coding->produced += produced;
   coding->produced_char += produced_chars;
   return carryover;
@@ -7400,7 +7622,7 @@ encode_coding (struct coding_system *coding)
   } while (coding->consumed_char < coding->src_chars);
 
   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
-    insert_from_gap (coding->produced_char, coding->produced);
+    insert_from_gap (coding->produced_char, coding->produced, 0);
 
   SAFE_FREE ();
 }
@@ -7510,39 +7732,61 @@ decode_coding_gap (struct coding_system *coding,
   if (CODING_REQUIRE_DETECTION (coding))
     detect_coding (coding);
   attrs = CODING_ID_ATTRS (coding->id);
-#ifndef CODING_DISABLE_ASCII_OPTIMIZATION
-  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
+  if (! disable_ascii_optimization
+      && ! coding->src_multibyte
+      && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
       && NILP (CODING_ATTR_POST_READ (attrs))
-      && NILP (get_translation_table (attrs, 0, NULL))
-      && (inhibit_eol_conversion
-         || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)))
+      && NILP (get_translation_table (attrs, 0, NULL)))
     {
-      /* We can skip the conversion if all source bytes are ASCII.  */
-      if (coding->head_ascii < 0)
+      chars = coding->head_ascii;
+      if (chars < 0)
+       chars = check_ascii (coding);
+      if (chars != bytes)
        {
-         /* We have not yet counted the number of ASCII bytes at the
-            head of the source.  Do it now.  */
-         const unsigned char *src, *src_end;
+         if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
+           chars = check_utf_8 (coding);
+         else
+           chars = -1;
+       }
+      if (chars >= 0)
+       {
+         if (coding->eol_seen != EOL_SEEN_NONE)
+           adjust_coding_eol_type (coding, coding->eol_seen);
 
-         coding_set_source (coding);
-         src_end = coding->source + coding->src_bytes;
-         for (src = coding->source; src < src_end; src++)
+         if (coding->eol_seen == EOL_SEEN_CR)
            {
-             if (*src & 0x80)
-               break;
+             unsigned char *src_end = GAP_END_ADDR;
+             unsigned char *src = src_end - coding->src_bytes;
+
+             while (src < src_end)
+               {
+                 if (*src++ == '\r')
+                   src[-1] = '\n';
+               }
+           }
+         else if (coding->eol_seen == EOL_SEEN_CRLF)
+           {
+             unsigned char *src = GAP_END_ADDR;
+             unsigned char *src_beg = src - coding->src_bytes;
+             unsigned char *dst = src;
+             ptrdiff_t diff;
+
+             while (src_beg < src)
+               {
+                 *--dst = *--src;
+                 if (*src == '\n' && src > src_beg && src[-1] == '\r')
+                   src--;
+               }
+             diff = dst - src;
+             bytes -= diff;
+             chars -= diff;
            }
-         coding->head_ascii = src - coding->source;
-       }
-      if (coding->src_bytes == coding->head_ascii)
-       {
-         /* No need of conversion.  Use the data in the gap as is.  */
-         coding->produced_char = chars;
          coding->produced = bytes;
-         adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1);
+         coding->produced_char = chars;
+         insert_from_gap (chars, bytes, 1);
          return;
        }
     }
-#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */
   code_conversion_save (0, 0);
 
   coding->mode |= CODING_MODE_LAST_BLOCK;
@@ -10758,6 +11002,11 @@ from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
 decode text as usual.  */);
   inhibit_null_byte_detection = 0;
 
+  DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
+              doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
+Internal use only.  Removed after the experimental optimizer gets stable. */);
+  disable_ascii_optimization = 0;
+
   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
               doc: /* Char table for translating self-inserting characters.
 This is applied to the result of input methods, not their input.