X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/eb7a410c147507ffdf0e84d163a014acb82b19a2..3fa2054efdfa3c22456072254e6c67682a595233:/src/coding.c

diff --git a/src/coding.c b/src/coding.c
index f9799035b3..c10fb37567 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -493,6 +493,8 @@ enum iso_code_class_type
 
 #define CODING_ISO_FLAG_USE_OLDJIS	0x10000
 
+#define CODING_ISO_FLAG_LEVEL_4		0x20000
+
 #define CODING_ISO_FLAG_FULL_SUPPORT	0x100000
 
 /* A character to be produced on output if encoding of the original
@@ -649,12 +651,45 @@ static struct coding_system coding_categories[coding_category_max];
 #define max(a, b) ((a) > (b) ? (a) : (b))
 #endif
 
+/* Encode FLAG as -1 if it is nil, 1 if it is t, and 0 otherwise.  */
+
+static int
+encode_inhibit_flag (Lisp_Object flag)
+{
+  return NILP (flag) ? -1 : EQ (flag, Qt) ? 1 : 0;
+}
+
+/* Return true if the flag encoded as ENCODED_FLAG is in effect: 1 forces
+   it on, -1 forces it off, and 0 defers to the user variable VAR.  */
+
+static bool
+inhibit_flag (int encoded_flag, bool var)
+{
+  return encoded_flag > 0 || (encoded_flag == 0 && var);
+}
+
 #define CODING_GET_INFO(coding, attrs, charset_list)	\
   do {							\
     (attrs) = CODING_ID_ATTRS ((coding)->id);		\
     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);	\
   } while (0)
 
+static void
+CHECK_NATNUM_CAR (Lisp_Object x)
+{
+  Lisp_Object car = XCAR (x);	/* Check a local copy of the car of X.  */
+  CHECK_NATNUM (car);
+  XSETCAR (x, car);		/* Store the checked value back into X.  */
+}
+
+static void
+CHECK_NATNUM_CDR (Lisp_Object x)
+{
+  Lisp_Object cdr = XCDR (x);	/* Check a local copy of the cdr of X.  */
+  CHECK_NATNUM (cdr);
+  XSETCDR (x, cdr);		/* Store the checked value back into X.  */
+}
+
 
 /* Safely get one byte from the source text pointed by SRC which ends
    at SRC_END, and set C to that byte.  If there are not enough bytes
@@ -1125,6 +1160,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
     *buf++ = id;							\
   } while (0)
 
+
+/* Bitmasks for coding->eol_seen.  */
+
+#define EOL_SEEN_NONE	0	/* No end-of-line sequence recorded.  */
+#define EOL_SEEN_LF	1	/* A '\n' line ending was seen.  */
+#define EOL_SEEN_CR	2	/* A '\r' not followed by '\n' was seen.  */
+#define EOL_SEEN_CRLF	4	/* A '\r' '\n' pair was seen.  */
+
 
 /*** 2. Emacs' internal format (emacs-utf-8) ***/
 
@@ -1147,6 +1190,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
 #define UTF_8_BOM_2 0xBB
 #define UTF_8_BOM_3 0xBF
 
+/* Unlike the other detect_coding_XXX functions, this one counts the
+   number of characters and checks the EOL format.  */
+
 static bool
 detect_coding_utf_8 (struct coding_system *coding,
 		     struct coding_detection_info *detect_info)
@@ -1156,11 +1202,23 @@ detect_coding_utf_8 (struct coding_system *coding,
   bool multibytep = coding->src_multibyte;
   ptrdiff_t consumed_chars = 0;
   bool bom_found = 0;
-  bool found = 0;
+  int nchars = coding->head_ascii;
+  int eol_seen = coding->eol_seen;
 
   detect_info->checked |= CATEGORY_MASK_UTF_8;
   /* A coding system of this category is always ASCII compatible.  */
-  src += coding->head_ascii;
+  src += nchars;
+
+  if (src == coding->source	/* BOM should be at the head.  */
+      && src + 3 < src_end	/* BOM is 3-byte long.  */
+      && src[0] == UTF_8_BOM_1
+      && src[1] == UTF_8_BOM_2
+      && src[2] == UTF_8_BOM_3)
+    {
+      bom_found = 1;
+      src += 3;
+      nchars++;
+    }
 
   while (1)
     {
@@ -1169,13 +1227,29 @@ detect_coding_utf_8 (struct coding_system *coding,
       src_base = src;
       ONE_MORE_BYTE (c);
       if (c < 0 || UTF_8_1_OCTET_P (c))
-	continue;
+	{
+	  nchars++;
+	  if (c == '\r')
+	    {
+	      if (src < src_end && *src == '\n')
+		{
+		  eol_seen |= EOL_SEEN_CRLF;
+		  src++;
+		  nchars++;
+		}
+	      else
+		eol_seen |= EOL_SEEN_CR;
+	    }
+	  else if (c == '\n')
+	    eol_seen |= EOL_SEEN_LF;
+	  continue;
+	}
       ONE_MORE_BYTE (c1);
       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
 	break;
       if (UTF_8_2_OCTET_LEADING_P (c))
 	{
-	  found = 1;
+	  nchars++;
 	  continue;
 	}
       ONE_MORE_BYTE (c2);
@@ -1183,10 +1257,7 @@ detect_coding_utf_8 (struct coding_system *coding,
 	break;
       if (UTF_8_3_OCTET_LEADING_P (c))
 	{
-	  found = 1;
-	  if (src_base == coding->source
-	      && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
-	    bom_found = 1;
+	  nchars++;
 	  continue;
 	}
       ONE_MORE_BYTE (c3);
@@ -1194,7 +1265,7 @@ detect_coding_utf_8 (struct coding_system *coding,
 	break;
       if (UTF_8_4_OCTET_LEADING_P (c))
 	{
-	  found = 1;
+	  nchars++;
 	  continue;
 	}
       ONE_MORE_BYTE (c4);
@@ -1202,7 +1273,7 @@ detect_coding_utf_8 (struct coding_system *coding,
 	break;
       if (UTF_8_5_OCTET_LEADING_P (c))
 	{
-	  found = 1;
+	  nchars++;
 	  continue;
 	}
       break;
@@ -1219,14 +1290,17 @@ detect_coding_utf_8 (struct coding_system *coding,
   if (bom_found)
     {
       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
-      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
+      detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
     }
   else
     {
       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
-      if (found)
-	detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
+      if (nchars < src_end - coding->source)
+	/* Fewer characters than source bytes were found, which
+	   means that we found valid non-ASCII characters.  */
+	detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
     }
+  coding->detected_utf8_chars = nchars;
   return 1;
 }
 
@@ -1291,6 +1365,45 @@ decode_coding_utf_8 (struct coding_system *coding)
 	  break;
 	}
 
+      /* In the simple case, rapidly handle ordinary ASCII characters.  */
+      if (multibytep && ! eol_dos
+	  && charbuf < charbuf_end - 6 && src < src_end - 6)
+	{
+	  while (charbuf < charbuf_end - 6 && src < src_end - 6)
+	    {
+	      c1 = *src;
+	      if (c1 & 0x80)
+		break;
+	      src++;
+	      consumed_chars++;
+	      *charbuf++ = c1;
+
+	      c1 = *src;
+	      if (c1 & 0x80)
+		break;
+	      src++;
+	      consumed_chars++;
+	      *charbuf++ = c1;
+
+	      c1 = *src;
+	      if (c1 & 0x80)
+		break;
+	      src++;
+	      consumed_chars++;
+	      *charbuf++ = c1;
+
+	      c1 = *src;
+	      if (c1 & 0x80)
+		break;
+	      src++;
+	      consumed_chars++;
+	      *charbuf++ = c1;
+	    }
+	  /* If we handled at least one character, restart the main loop.  */
+	  if (src != src_base)
+	    continue;
+	}
+
       if (byte_after_cr >= 0)
 	c1 = byte_after_cr, byte_after_cr = -1;
       else
@@ -3661,7 +3774,10 @@ decode_coding_iso_2022 (struct coding_system *coding)
 	      else
 		charset = CHARSET_FROM_ID (charset_id_2);
 	      ONE_MORE_BYTE (c1);
-	      if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
+	      if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
+		  || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
+		      && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
+			  ? c1 >= 0x80 : c1 < 0x80)))
 		goto invalid_code;
 	      break;
 
@@ -3675,7 +3791,10 @@ decode_coding_iso_2022 (struct coding_system *coding)
 	      else
 		charset = CHARSET_FROM_ID (charset_id_3);
 	      ONE_MORE_BYTE (c1);
-	      if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
+	      if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
+		  || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
+		      && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
+			  ? c1 >= 0x80 : c1 < 0x80)))
 		goto invalid_code;
 	      break;
 
@@ -3887,6 +4006,14 @@ decode_coding_iso_2022 (struct coding_system *coding)
       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
       char_offset++;
       coding->errors++;
+      /* Reset the invocation and designation status to the safest
+	 one; i.e. designate ASCII to the graphic register 0, and
+	 invoke that register to the graphic plane 0.  This typically
+	 helps the case that a designation sequence for ASCII "ESC (
+	 B" is somehow broken (e.g. broken by a newline).  */
+      CODING_ISO_INVOCATION (coding, 0) = 0;
+      CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
+      charset_id_0 = charset_ascii;
       continue;
 
     break_loop:
@@ -5614,7 +5741,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
 
   coding->mode = 0;
-  coding->head_ascii = -1;
   if (VECTORP (eol_type))
     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
 			    | CODING_REQUIRE_DETECTION_MASK);
@@ -5643,6 +5769,14 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
       coding->decoder = decode_coding_raw_text;
       coding->encoder = encode_coding_raw_text;
       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
+      coding->spec.undecided.inhibit_nbd
+	= (encode_inhibit_flag
+	   (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
+      coding->spec.undecided.inhibit_ied
+	= (encode_inhibit_flag
+	   (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
+      coding->spec.undecided.prefer_utf_8
+	= ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
     }
   else if (EQ (coding_type, Qiso_2022))
     {
@@ -6066,46 +6200,35 @@ complement_process_encoding_system (Lisp_Object coding_system)
 
 */
 
-#define EOL_SEEN_NONE	0
-#define EOL_SEEN_LF	1
-#define EOL_SEEN_CR	2
-#define EOL_SEEN_CRLF	4
-
-
 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
 					   int eol_seen);
 
 
 /* Return the number of ASCII characters at the head of the source.
-   By side effects, set coding->head_ascii and coding->eol_seen.  The
-   value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
-   EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
-   all the source bytes are ASCII.  */
+   By side effects, set coding->head_ascii and update
+   coding->eol_seen.  The value of coding->eol_seen is "logical or" of
+   EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
+   reliable only when all the source bytes are ASCII.  */
 
 static int
 check_ascii (struct coding_system *coding)
 {
   const unsigned char *src, *end;
   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
-  int eol_seen;
+  int eol_seen = coding->eol_seen;
 
-  eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
-	      : EQ (eol_type, Qunix) ? EOL_SEEN_LF
-	      : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
-	      : EOL_SEEN_CR);
   coding_set_source (coding);
   src = coding->source;
   end = src + coding->src_bytes;
 
   if (inhibit_eol_conversion
-      || eol_seen != EOL_SEEN_NONE)
+      || SYMBOLP (eol_type))
     {
       /* We don't have to check EOL format.  */
-      while (src < end && !( *src & 0x80)) src++;
-      if (inhibit_eol_conversion)
+      while (src < end && !( *src & 0x80))
 	{
-	  eol_seen = EOL_SEEN_LF;
-	  adjust_coding_eol_type (coding, eol_seen);
+	  if (*src++ == '\n')
+	    eol_seen |= EOL_SEEN_LF;
 	}
     }
   else
@@ -6163,7 +6286,7 @@ static int
 check_utf_8 (struct coding_system *coding)
 {
   const unsigned char *src, *end;
-  int eol_seen = coding->eol_seen;
+  int eol_seen;
   int nchars = coding->head_ascii;
 
   if (coding->head_ascii < 0)
@@ -6173,7 +6296,7 @@ check_utf_8 (struct coding_system *coding)
   src = coding->source + coding->head_ascii;
   /* We look ahead one byte for CR LF.  */
   end = coding->source + coding->src_bytes - 1;
-
+  eol_seen = coding->eol_seen;
   while (src < end)
     {
       int c = *src;
@@ -6394,6 +6517,8 @@ detect_coding (struct coding_system *coding)
 {
   const unsigned char *src, *src_end;
   unsigned int saved_mode = coding->mode;
+  Lisp_Object found = Qnil;
+  Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
 
   coding->consumed = coding->consumed_char = 0;
   coding->produced = coding->produced_char = 0;
@@ -6401,6 +6526,7 @@ detect_coding (struct coding_system *coding)
 
   src_end = coding->source + coding->src_bytes;
 
+  coding->eol_seen = EOL_SEEN_NONE;
   /* If we have not yet decided the text encoding type, detect it
      now.  */
   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
@@ -6408,9 +6534,13 @@ detect_coding (struct coding_system *coding)
       int c, i;
       struct coding_detection_info detect_info;
       bool null_byte_found = 0, eight_bit_found = 0;
+      bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
+				       inhibit_null_byte_detection);
+      bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
+				       inhibit_iso_escape_detection);
+      bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
 
       coding->head_ascii = 0;
-      coding->eol_seen = EOL_SEEN_NONE;
       detect_info.checked = detect_info.found = detect_info.rejected = 0;
       for (src = coding->source; src < src_end; src++)
 	{
@@ -6424,7 +6554,7 @@ detect_coding (struct coding_system *coding)
 	  else if (c < 0x20)
 	    {
 	      if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
-		  && ! inhibit_iso_escape_detection
+		  && ! inhibit_ied
 		  && ! detect_info.checked)
 		{
 		  if (detect_coding_iso_2022 (coding, &detect_info))
@@ -6443,7 +6573,7 @@ detect_coding (struct coding_system *coding)
 		      break;
 		    }
 		}
-	      else if (! c && !inhibit_null_byte_detection)
+	      else if (! c && !inhibit_nbd)
 		{
 		  null_byte_found = 1;
 		  if (eight_bit_found)
@@ -6500,6 +6630,12 @@ detect_coding (struct coding_system *coding)
 		  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
 		  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
 		}
+	      else if (prefer_utf_8
+		       && detect_coding_utf_8 (coding, &detect_info))
+		{
+		  detect_info.checked |= ~CATEGORY_MASK_UTF_8;
+		  detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
+		}
 	      for (i = 0; i < coding_category_raw_text; i++)
 		{
 		  category = coding_priorities[i];
@@ -6521,32 +6657,58 @@ detect_coding (struct coding_system *coding)
 		    }
 		  else if ((*(this->detector)) (coding, &detect_info)
 			   && detect_info.found & (1 << category))
-		    {
-		      if (category == coding_category_utf_16_auto)
-			{
-			  if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
-			    category = coding_category_utf_16_le;
-			  else
-			    category = coding_category_utf_16_be;
-			}
-		      break;
-		    }
+		    break;
 		}
 	    }
 
 	  if (i < coding_category_raw_text)
-	    setup_coding_system (CODING_ID_NAME (this->id), coding);
+	    {
+	      if (category == coding_category_utf_8_auto)
+		{
+		  Lisp_Object coding_systems;
+
+		  coding_systems = AREF (CODING_ID_ATTRS (this->id),
+					 coding_attr_utf_bom);
+		  if (CONSP (coding_systems))
+		    {
+		      if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+			found = XCAR (coding_systems);
+		      else
+			found = XCDR (coding_systems);
+		    }
+		  else
+		    found = CODING_ID_NAME (this->id);
+		}
+	      else if (category == coding_category_utf_16_auto)
+		{
+		  Lisp_Object coding_systems;
+
+		  coding_systems = AREF (CODING_ID_ATTRS (this->id),
+					 coding_attr_utf_bom);
+		  if (CONSP (coding_systems))
+		    {
+		      if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
+			found = XCAR (coding_systems);
+		      else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
+			found = XCDR (coding_systems);
+		    }
+		  else
+		    found = CODING_ID_NAME (this->id);
+		}
+	      else
+		found = CODING_ID_NAME (this->id);
+	    }
 	  else if (null_byte_found)
-	    setup_coding_system (Qno_conversion, coding);
+	    found = Qno_conversion;
 	  else if ((detect_info.rejected & CATEGORY_MASK_ANY)
 		   == CATEGORY_MASK_ANY)
-	    setup_coding_system (Qraw_text, coding);
+	    found = Qraw_text;
 	  else if (detect_info.rejected)
 	    for (i = 0; i < coding_category_raw_text; i++)
 	      if (! (detect_info.rejected & (1 << coding_priorities[i])))
 		{
 		  this = coding_categories + coding_priorities[i];
-		  setup_coding_system (CODING_ID_NAME (this->id), coding);
+		  found = CODING_ID_NAME (this->id);
 		  break;
 		}
 	}
@@ -6562,12 +6724,8 @@ detect_coding (struct coding_system *coding)
       detect_info.found = detect_info.rejected = 0;
       if (check_ascii (coding) == coding->src_bytes)
 	{
-	  int head_ascii = coding->head_ascii;
-
-	  if (coding->eol_seen != EOL_SEEN_NONE)
-	    adjust_coding_eol_type (coding, coding->eol_seen);
-	  setup_coding_system (XCDR (coding_systems), coding);
-	  coding->head_ascii = head_ascii;
+	  if (CONSP (coding_systems))
+	    found = XCDR (coding_systems);
 	}
       else
 	{
@@ -6575,9 +6733,9 @@ detect_coding (struct coding_system *coding)
 	      && detect_coding_utf_8 (coding, &detect_info))
 	    {
 	      if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
-		setup_coding_system (XCAR (coding_systems), coding);
+		found = XCAR (coding_systems);
 	      else
-		setup_coding_system (XCDR (coding_systems), coding);
+		found = XCDR (coding_systems);
 	    }
 	}
     }
@@ -6591,16 +6749,28 @@ detect_coding (struct coding_system *coding)
 	= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
       detect_info.found = detect_info.rejected = 0;
       coding->head_ascii = 0;
-      coding->eol_seen = EOL_SEEN_NONE;
       if (CONSP (coding_systems)
 	  && detect_coding_utf_16 (coding, &detect_info))
 	{
 	  if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
-	    setup_coding_system (XCAR (coding_systems), coding);
+	    found = XCAR (coding_systems);
 	  else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
-	    setup_coding_system (XCDR (coding_systems), coding);
+	    found = XCDR (coding_systems);
 	}
     }
+
+  if (! NILP (found))
+    {
+      int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
+			   : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
+			   : EQ (eol_type, Qmac) ? EOL_SEEN_CR
+			   : EOL_SEEN_LF);
+
+      setup_coding_system (found, coding);
+      if (specified_eol != EOL_SEEN_NONE)
+	adjust_coding_eol_type (coding, specified_eol);
+    }
+
   coding->mode = saved_mode;
 }
 
@@ -6741,11 +6911,9 @@ get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
       if (CHAR_TABLE_P (standard))
 	{
 	  if (CONSP (translation_table))
-	    translation_table = nconc2 (translation_table,
-					Fcons (standard, Qnil));
+	    translation_table = nconc2 (translation_table, list1 (standard));
 	  else
-	    translation_table = Fcons (translation_table,
-				       Fcons (standard, Qnil));
+	    translation_table = list2 (translation_table, standard);
 	}
     }
 
@@ -7329,7 +7497,7 @@ handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
 	  /* We found a composition.  Store the corresponding
 	     annotation data in BUF.  */
 	  int *head = buf;
-	  enum composition_method method = COMPOSITION_METHOD (prop);
+	  enum composition_method method = composition_method (prop);
 	  int nchars = COMPOSITION_LENGTH (prop);
 
 	  ADD_COMPOSITION_DATA (buf, nchars, 0, method);
@@ -7670,7 +7838,7 @@ make_conversion_work_buffer (bool multibyte)
 }
 
 
-static Lisp_Object
+static void
 code_conversion_restore (Lisp_Object arg)
 {
   Lisp_Object current, workbuf;
@@ -7688,7 +7856,6 @@ code_conversion_restore (Lisp_Object arg)
     }
   set_buffer_internal (XBUFFER (current));
   UNGCPRO;
-  return Qnil;
 }
 
 Lisp_Object
@@ -7721,6 +7888,9 @@ decode_coding_gap (struct coding_system *coding,
   coding->dst_pos_byte = PT_BYTE;
   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
 
+  coding->head_ascii = -1;
+  coding->detected_utf8_chars = -1;
+  coding->eol_seen = EOL_SEEN_NONE;
   if (CODING_REQUIRE_DETECTION (coding))
     detect_coding (coding);
   attrs = CODING_ID_ATTRS (coding->id);
@@ -7735,17 +7905,38 @@ decode_coding_gap (struct coding_system *coding,
 	chars = check_ascii (coding);
       if (chars != bytes)
 	{
+	  /* There exists a non-ASCII byte.  */
 	  if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
-	    chars = check_utf_8 (coding);
+	    {
+	      if (coding->detected_utf8_chars >= 0)
+		chars = coding->detected_utf8_chars;
+	      else
+		chars = check_utf_8 (coding);
+	      if (CODING_UTF_8_BOM (coding) != utf_without_bom
+		  && coding->head_ascii == 0
+		  && coding->source[0] == UTF_8_BOM_1
+		  && coding->source[1] == UTF_8_BOM_2
+		  && coding->source[2] == UTF_8_BOM_3)
+		{
+		  chars--;
+		  bytes -= 3;
+		  coding->src_bytes -= 3;
+		}
+	    }
 	  else
 	    chars = -1;
 	}
       if (chars >= 0)
 	{
-	  if (coding->eol_seen != EOL_SEEN_NONE)
-	    adjust_coding_eol_type (coding, coding->eol_seen);
+	  Lisp_Object eol_type;
 
-	  if (coding->eol_seen == EOL_SEEN_CR)
+	  eol_type = CODING_ID_EOL_TYPE (coding->id);
+	  if (VECTORP (eol_type))
+	    {
+	      if (coding->eol_seen != EOL_SEEN_NONE)
+		eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
+	    }
+	  if (EQ (eol_type, Qmac))
 	    {
 	      unsigned char *src_end = GAP_END_ADDR;
 	      unsigned char *src = src_end - coding->src_bytes;
@@ -7756,7 +7947,7 @@ decode_coding_gap (struct coding_system *coding,
 		    src[-1] = '\n';
 		}
 	    }
-	  else if (coding->eol_seen == EOL_SEEN_CRLF)
+	  else if (EQ (eol_type, Qdos))
 	    {
 	      unsigned char *src = GAP_END_ADDR;
 	      unsigned char *src_beg = src - coding->src_bytes;
@@ -7766,7 +7957,7 @@ decode_coding_gap (struct coding_system *coding,
 	      while (src_beg < src)
 		{
 		  *--dst = *--src;
-		  if (*src == '\n')
+		  if (*src == '\n' && src > src_beg && src[-1] == '\r')
 		    src--;
 		}
 	      diff = dst - src;
@@ -8403,6 +8594,11 @@ detect_coding_system (const unsigned char *src,
       enum coding_category category IF_LINT (= 0);
       struct coding_system *this IF_LINT (= NULL);
       int c, i;
+      bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
+				       inhibit_null_byte_detection);
+      bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
+				       inhibit_iso_escape_detection);
+      bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
 
       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
       for (; src < src_end; src++)
@@ -8417,7 +8613,7 @@ detect_coding_system (const unsigned char *src,
 	  else if (c < 0x20)
 	    {
 	      if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
-		  && ! inhibit_iso_escape_detection
+		  && ! inhibit_ied
 		  && ! detect_info.checked)
 		{
 		  if (detect_coding_iso_2022 (&coding, &detect_info))
@@ -8436,7 +8632,7 @@ detect_coding_system (const unsigned char *src,
 		      break;
 		    }
 		}
-	      else if (! c && !inhibit_null_byte_detection)
+	      else if (! c && !inhibit_nbd)
 		{
 		  null_byte_found = 1;
 		  if (eight_bit_found)
@@ -8469,6 +8665,12 @@ detect_coding_system (const unsigned char *src,
 		  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
 		  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
 		}
+	      else if (prefer_utf_8
+		       && detect_coding_utf_8 (&coding, &detect_info))
+		{
+		  detect_info.checked |= ~CATEGORY_MASK_UTF_8;
+		  detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
+		}
 	      for (i = 0; i < coding_category_raw_text; i++)
 		{
 		  category = coding_priorities[i];
@@ -8509,20 +8711,20 @@ detect_coding_system (const unsigned char *src,
 	{
 	  detect_info.found = CATEGORY_MASK_RAW_TEXT;
 	  id = CODING_SYSTEM_ID (Qno_conversion);
-	  val = Fcons (make_number (id), Qnil);
+	  val = list1 (make_number (id));
 	}
       else if (! detect_info.rejected && ! detect_info.found)
 	{
 	  detect_info.found = CATEGORY_MASK_ANY;
 	  id = coding_categories[coding_category_undecided].id;
-	  val = Fcons (make_number (id), Qnil);
+	  val = list1 (make_number (id));
 	}
       else if (highest)
 	{
 	  if (detect_info.found)
 	    {
 	      detect_info.found = 1 << category;
-	      val = Fcons (make_number (this->id), Qnil);
+	      val = list1 (make_number (this->id));
 	    }
 	  else
 	    for (i = 0; i < coding_category_raw_text; i++)
@@ -8530,7 +8732,7 @@ detect_coding_system (const unsigned char *src,
 		{
 		  detect_info.found = 1 << coding_priorities[i];
 		  id = coding_categories[coding_priorities[i]].id;
-		  val = Fcons (make_number (id), Qnil);
+		  val = list1 (make_number (id));
 		  break;
 		}
 	}
@@ -8547,7 +8749,7 @@ detect_coding_system (const unsigned char *src,
 		  found |= 1 << category;
 		  id = coding_categories[category].id;
 		  if (id >= 0)
-		    val = Fcons (make_number (id), val);
+		    val = list1 (make_number (id));
 		}
 	    }
 	  for (i = coding_category_raw_text - 1; i >= 0; i--)
@@ -8572,7 +8774,7 @@ detect_coding_system (const unsigned char *src,
 	    this = coding_categories + coding_category_utf_8_sig;
 	  else
 	    this = coding_categories + coding_category_utf_8_nosig;
-	  val = Fcons (make_number (this->id), Qnil);
+	  val = list1 (make_number (this->id));
 	}
     }
   else if (base_category == coding_category_utf_16_auto)
@@ -8589,13 +8791,13 @@ detect_coding_system (const unsigned char *src,
 	    this = coding_categories + coding_category_utf_16_be_nosig;
 	  else
 	    this = coding_categories + coding_category_utf_16_le_nosig;
-	  val = Fcons (make_number (this->id), Qnil);
+	  val = list1 (make_number (this->id));
 	}
     }
   else
     {
       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
-      val = Fcons (make_number (coding.id), Qnil);
+      val = list1 (make_number (coding.id));
     }
 
   /* Then, detect eol-format if necessary.  */
@@ -8807,8 +9009,7 @@ DEFUN ("find-coding-systems-region-internal",
 	Lisp_Object attrs;
 
 	attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
-	if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
-	    && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
+	if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
 	  {
 	    ASET (attrs, coding_attr_trans_tbl,
 		  get_translation_table (attrs, 1, NULL));
@@ -9067,7 +9268,7 @@ is nil.  */)
       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
       ASET (attrs, coding_attr_trans_tbl,
 	    get_translation_table (attrs, 1, NULL));
-      list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
+      list = Fcons (list2 (elt, attrs), list);
     }
 
   if (STRINGP (start))
@@ -9478,7 +9679,7 @@ DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_intern
   tset_charset_list
     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
 	    ? coding_charset_list (terminal_coding)
-	    : Fcons (make_number (charset_ascii), Qnil)));
+	    : list1 (make_number (charset_ascii))));
   return Qnil;
 }
 
@@ -9923,9 +10124,9 @@ usage: (define-coding-system-internal ...)  */)
 		{
 		  dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
 		  if (dim < dim2)
-		    tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
+		    tmp = list2 (XCAR (tail), tmp);
 		  else
-		    tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
+		    tmp = list2 (tmp, XCAR (tail));
 		}
 	      else
 		{
@@ -9936,7 +10137,7 @@ usage: (define-coding-system-internal ...)  */)
 			break;
 		    }
 		  if (NILP (tmp2))
-		    tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
+		    tmp = nconc2 (tmp, list1 (XCAR (tail)));
 		  else
 		    {
 		      XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
@@ -10222,7 +10423,17 @@ usage: (define-coding-system-internal ...)  */)
 		  : coding_category_utf_8_sig);
     }
   else if (EQ (coding_type, Qundecided))
-    category = coding_category_undecided;
+    {
+      if (nargs < coding_arg_undecided_max)
+	goto short_args;
+      ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
+	    args[coding_arg_undecided_inhibit_null_byte_detection]);
+      ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
+	    args[coding_arg_undecided_inhibit_iso_escape_detection]);
+      ASET (attrs, coding_attr_undecided_prefer_utf_8,
+	    args[coding_arg_undecided_prefer_utf_8]);
+      category = coding_category_undecided;
+    }
   else
     error ("Invalid coding system type: %s",
 	   SDATA (SYMBOL_NAME (coding_type)));
@@ -10244,7 +10455,7 @@ usage: (define-coding-system-internal ...)  */)
       && ! EQ (eol_type, Qmac))
     error ("Invalid eol-type");
 
-  aliases = Fcons (name, Qnil);
+  aliases = list1 (name);
 
   if (NILP (eol_type))
     {
@@ -10254,7 +10465,7 @@ usage: (define-coding-system-internal ...)  */)
 	  Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
 
 	  this_name = AREF (eol_type, i);
-	  this_aliases = Fcons (this_name, Qnil);
+	  this_aliases = list1 (this_name);
 	  this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
 	  this_spec = make_uninit_vector (3);
 	  ASET (this_spec, 0, attrs);
@@ -10369,7 +10580,7 @@ DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
      list.  */
   while (!NILP (XCDR (aliases)))
     aliases = XCDR (aliases);
-  XSETCDR (aliases, Fcons (alias, Qnil));
+  XSETCDR (aliases, list1 (alias));
 
   eol_type = AREF (spec, 2);
   if (VECTORP (eol_type))
@@ -10603,11 +10814,6 @@ syms_of_coding (void)
   Fput (Qcoding_system_error, Qerror_message,
 	build_pure_c_string ("Invalid coding system"));
 
-  /* Intern this now in case it isn't already done.
-     Setting this variable twice is harmless.
-     But don't staticpro it here--that is done in alloc.c.  */
-  Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
-
   DEFSYM (Qtranslation_table, "translation-table");
   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
   DEFSYM (Qtranslation_table_id, "translation-table-id");
@@ -11010,11 +11216,11 @@ internal character representation.  */);
     Vtranslation_table_for_input = Qnil;
 
   {
-    Lisp_Object args[coding_arg_max];
+    Lisp_Object args[coding_arg_undecided_max];
     Lisp_Object plist[16];
     int i;
 
-    for (i = 0; i < coding_arg_max; i++)
+    for (i = 0; i < coding_arg_undecided_max; i++)
       args[i] = Qnil;
 
     plist[0] = intern_c_string (":name");
@@ -11051,7 +11257,9 @@ character.");
     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
     plist[15] = args[coding_arg_eol_type] = Qnil;
     args[coding_arg_plist] = Flist (16, plist);
-    Fdefine_coding_system_internal (coding_arg_max, args);
+    args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
+    args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
+    Fdefine_coding_system_internal (coding_arg_undecided_max, args);
   }
 
   setup_coding_system (Qno_conversion, &safe_terminal_coding);