X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/1abfd3e85fa9b340699430cd9e15dd9f0073bdbe..8dd58a2d1fedaa16573bc67e986dc2014620c681:/lisp/international/mule.el

diff --git a/lisp/international/mule.el b/lisp/international/mule.el
index 47c7087518..181474c65c 100644
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@@ -1,6 +1,6 @@
 ;;; mule.el --- basic commands for multilingual environment
 
-;; Copyright (C) 1997-2013 Free Software Foundation, Inc.
+;; Copyright (C) 1997-2015 Free Software Foundation, Inc.
 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
 ;;   2005, 2006, 2007, 2008, 2009, 2010, 2011
 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
@@ -407,12 +407,12 @@ PLIST (property list) may contain any type of information a user
 ;; because that makes a bootstrapping problem
 ;; if you need to recompile all the Lisp files using interpreted code.
 
-(defun charset-id (charset)
+(defun charset-id (_charset)
   "Always return 0.  This is provided for backward compatibility."
   (declare (obsolete nil "23.1"))
   0)
 
-(defmacro charset-bytes (charset)
+(defmacro charset-bytes (_charset)
   "Always return 0.  This is provided for backward compatibility."
   (declare (obsolete nil "23.1"))
   0)
@@ -471,7 +471,7 @@ Return -1 if charset isn't an ISO 2022 one."
 ;;; CHARACTER
 (define-obsolete-function-alias 'char-valid-p 'characterp "23.1")
 
-(defun generic-char-p (char)
+(defun generic-char-p (_char)
   "Always return nil.  This is provided for backward compatibility."
   (declare (obsolete nil "23.1"))
   nil)
@@ -518,7 +518,8 @@ Return -1 if charset isn't an ISO 2022 one."
     composition
     euc-tw-shift
     use-roman
-    use-oldjis)
+    use-oldjis
+    8-bit-level-4)
   "List of symbols that control ISO-2022 encoder/decoder.
 
 The value of the `:flags' attribute in the argument of the function
@@ -542,8 +543,9 @@ If `locking-shift' is specified, decode locking-shift code correctly
 on decoding, and use locking-shift to invoke a graphic element on
 encoding.
 
-If `single-shift' is specified, decode single-shift code correctly on
-decoding, and use single-shift to invoke a graphic element on encoding.
+If `single-shift' is specified, decode single-shift code
+correctly on decoding, and use single-shift to invoke a graphic
+element on encoding.  See also `8-bit-level-4' specification.
 
 If `designation' is specified, decode designation code correctly on
 decoding, and use designation to designate a charset to a graphic
@@ -578,13 +580,42 @@ If `use-roman' is specified, JIS0201-1976-Roman is designated instead
 of ASCII.
 
 If `use-oldjis' is specified, JIS0208-1976 is designated instead of
-JIS0208-1983.")
+JIS0208-1983.
+
+If `8-bit-level-4' is specified, the decoder assumes the
+implementation level \"4\" for 8-bit codes which means that GL is
+identified as the single-shift area.  The default implementation
+level for 8-bit code is \"4A\" which means that GR is identified
+as the single-shift area.")
 
 (defun define-coding-system (name docstring &rest props)
   "Define NAME (a symbol) as a coding system with DOCSTRING and attributes.
 The remaining arguments must come in pairs ATTRIBUTE VALUE.  ATTRIBUTE
 may be any symbol.
 
+A coding system specifies a rule to decode (i.e. to convert a
+byte sequence to a character sequence) and a rule to encode (the
+opposite of decoding).
+
+The decoding is done by at most 3 steps; the first is to convert
+a byte sequence to a character sequence by one of Emacs'
+internal routines specified by `:coding-type' attribute.  The
+optional second step is to convert the character sequence (the
+result of the first step) by a translation table specified
+by `:decode-translation-table' attribute.  The optional third step
+is to convert the above result by a Lisp function specified
+by `:post-read-conversion' attribute.
+
+The encoding is done by at most 3 steps, which are the reverse
+of the decoding steps.  The optional first step converts a
+character sequence to another character sequence by a Lisp
+function specified by `:pre-write-conversion' attribute.  The
+optional second step converts the above result by a translation
+table specified by `:encode-translation-table' attribute.  The
+third step converts the above result to a byte sequence by one
+of the Emacs's internal routines specified by the `:coding-type'
+attribute.
+
 The following attributes have special meanings.  Those labeled as
 \"(required)\" should not be omitted.
 
@@ -594,27 +625,72 @@ VALUE is a character to display on mode line for the coding system.
 
 `:coding-type' (required)
 
-VALUE must be one of `charset', `utf-8', `utf-16', `iso-2022',
-`emacs-mule', `shift-jis', `ccl', `raw-text', `undecided'.
+VALUE specifies the format of byte sequence the coding system
+decodes and encodes to.  It must be one of `charset', `utf-8',
+`utf-16', `iso-2022', `emacs-mule', `shift-jis', `ccl',
+`raw-text', `undecided'.
+
+If VALUE is `charset', the coding system is for handling a
+byte sequence in which each byte or every two- to four-byte
+sequence represents a character code of a charset specified
+by the `:charset-list' attribute.
+
+If VALUE is `utf-8', the coding system is for handling Unicode
+UTF-8 byte sequences.  See also the documentation of the
+attribute `:bom'.
+
+If VALUE is `utf-16', the coding system is for handling Unicode
+UTF-16 byte sequences.  See also the documentation of the
+attributes :bom and `:endian'.
+
+If VALUE is `iso-2022', the coding system is for handling byte
+sequences conforming to ISO/IEC 2022.  See also the documentation
+of the attributes `:charset-list', `:flags', and `:designation'.
+
+If VALUE is `emacs-mule', the coding system is for handling
+byte sequences which Emacs 20 and 21 used for their internal
+representation of characters.
+
+If VALUE is `shift-jis', the coding system is for handling byte
+sequences of Shift_JIS format.  See also the attribute `:charset-list'.
+
+If VALUE is `ccl', the coding system uses CCL programs to decode
+and encode byte sequences.  The CCL programs must be
+specified by the attributes `:ccl-decoder' and `:ccl-encoder'.
+
+If VALUE is `raw-text', the coding system decodes byte sequences
+without any conversions.
 
 `:eol-type'
 
 VALUE is the EOL (end-of-line) format of the coding system.  It must be
 one of `unix', `dos', `mac'.  The symbol `unix' means Unix-like EOL
-\(i.e. single LF), `dos' means DOS-like EOL \(i.e. sequence of CR LF),
-and `mac' means Mac-like EOL \(i.e. single CR).  If omitted, Emacs
-detects the EOL format automatically when decoding.
+\(i.e. a single LF character), `dos' means DOS-like EOL \(i.e. a sequence
+of CR followed by LF), and `mac' means Mac-like EOL \(i.e. a single CR).
+If omitted, Emacs detects the EOL format automatically when decoding.
+
+`:charset-list' (required if `:coding-type' is `charset' or `shift-jis')
+
+VALUE must be a list of charsets supported by the coding system.
 
-`:charset-list'
+If `coding-type:' is `charset', then on decoding and encoding by the
+coding system, if a character belongs to multiple charsets in the
+list, a charset that comes first in the list is selected.
 
-VALUE must be a list of charsets supported by the coding system.  On
-encoding by the coding system, if a character belongs to multiple
-charsets in the list, a charset that comes earlier in the list is
-selected.  If `:coding-type' is `iso-2022', VALUE may be `iso-2022',
-which indicates that the coding system supports all ISO-2022 based
-charsets.  If `:coding-type' is `emacs-mule', VALUE may be
-`emacs-mule', which indicates that the coding system supports all
-charsets that have the `:emacs-mule-id' property.
+If `:coding-type' is `iso-2022', VALUE may be `iso-2022', which
+indicates that the coding system supports all ISO-2022 based
+charsets.
+
+If `:coding-type' is `shift-jis', VALUE must be a list of three
+to four charsets supported by Shift_JIS encoding scheme.  The
+first charset (one dimension) is for code space 0x00..0x7F, the
+second (one dimension) for 0xA1..0xDF, the third (two dimension)
+for 0x8140..0xEFFC, the optional fourth (three dimension) for
+0xF040..0xFCFC.
+
+If `:coding-type' is `emacs-mule', VALUE may be `emacs-mule',
+which indicates that the coding system supports all charsets that
+have the `:emacs-mule-id' property.
 
 `:ascii-compatible-p'
 
@@ -635,9 +711,9 @@ VALUE must be a translation table to use on encoding.
 VALUE must be a function to call after some text is inserted and
 decoded by the coding system itself and before any functions in
 `after-insert-functions' are called.  This function is passed one
-argument; the number of characters in the text to convert, with
+argument: the number of characters in the text to convert, with
 point at the start of the text.  The function should leave point
-the same, and return the new character count.
+unchanged, and should return the new character count.
 
 `:pre-write-conversion'
 
@@ -666,13 +742,13 @@ to lower case.
 `:mime-text-unsuitable'
 
 VALUE non-nil means the `:mime-charset' property names a charset which
-is unsuitable for the top-level media type \"text\".
+is unsuitable for the top-level media of type \"text\".
 
 `:flags'
 
 VALUE must be a list of symbols that control the ISO-2022 converter.
 Each must be a member of the list `coding-system-iso-2022-flags'
-\(which see).  This attribute has a meaning only when `:coding-type'
+\(which see).  This attribute is meaningful only when `:coding-type'
 is `iso-2022'.
 
 `:designation'
@@ -692,12 +768,12 @@ to GN.  If the list contains 96, any charsets whose whose ranges are
 96 long can be designated to GN.  If the first element is a charset,
 that charset is initially designated to GN.
 
-This attribute has a meaning only when `:coding-type' is `iso-2022'.
+This attribute is meaningful only when `:coding-type' is `iso-2022'.
 
 `:bom'
 
-This attributes specifies whether the coding system uses a `byte order
-mark'.  VALUE must be nil, t, or cons of coding systems whose
+This attributes specifies whether the coding system uses a \"byte order
+mark\".  VALUE must be nil, t, or a cons cell of coding systems whose
 `:coding-type' is `utf-16' or `utf-8'.
 
 If the value is nil, on decoding, don't treat the first two-byte as
@@ -706,13 +782,13 @@ BOM, and on encoding, don't produce BOM bytes.
 If the value is t, on decoding, skip the first two-byte as BOM, and on
 encoding, produce BOM bytes according to the value of `:endian'.
 
-If the value is cons, on decoding, check the first two-byte.  If they
-are 0xFE 0xFF, use the car part coding system of the value.  If they
-are 0xFF 0xFE, use the cdr part coding system of the value.
+If the value is a cons cell, on decoding, check the first two bytes.
+If they are 0xFE 0xFF, use the car part coding system of the value.
+If they are 0xFF 0xFE, use the cdr part coding system of the value.
 Otherwise, treat them as bytes for a normal character.  On encoding,
 produce BOM bytes according to the value of `:endian'.
 
-This attribute has a meaning only when `:coding-type' is `utf-16' or
+This attribute is meaningful only when `:coding-type' is `utf-16' or
 `utf-8'.
 
 `:endian'
@@ -720,19 +796,38 @@ This attribute has a meaning only when `:coding-type' is `utf-16' or
 VALUE must be `big' or `little' specifying big-endian and
 little-endian respectively.  The default value is `big'.
 
-This attribute has a meaning only when `:coding-type' is `utf-16'.
+This attribute is meaningful only when `:coding-type' is `utf-16'.
+
+`:ccl-decoder' (required if :coding-type is `ccl')
+
+VALUE is a CCL program name defined by `define-ccl-program'.  The
+CCL program reads a byte sequence and writes a character sequence
+as a decoding result.
+
+`:ccl-encoder' (required if :coding-type is `ccl')
+
+VALUE is a CCL program name defined by `define-ccl-program'.  The
+CCL program reads a character sequence and writes a byte sequence
+as an encoding result.
 
-`:ccl-decoder'
+`:inhibit-null-byte-detection'
 
-VALUE is a symbol representing the registered CCL program used for
-decoding.  This attribute has a meaning only when `:coding-type' is
-`ccl'.
+VALUE non-nil means Emacs ignore null bytes on code detection.
+See the variable `inhibit-null-byte-detection'.  This attribute
+is meaningful only when `:coding-type' is `undecided'.
 
-`:ccl-encoder'
+`:inhibit-iso-escape-detection'
 
-VALUE is a symbol representing the registered CCL program used for
-encoding.  This attribute has a meaning only when `:coding-type' is
-`ccl'."
+VALUE non-nil means Emacs ignores ISO-2022 escape sequences on
+code detection.  See the variable `inhibit-iso-escape-detection'.
+This attribute is meaningful only when `:coding-type' is
+`undecided'.
+
+`:prefer-utf-8'
+
+VALUE non-nil means Emacs prefers UTF-8 on code detection for
+non-ASCII files.  This attribute is meaningful only when
+`:coding-type' is `undecided'."
   (let* ((common-attrs (mapcar 'list
 			       '(:mnemonic
 				 :coding-type
@@ -761,7 +856,11 @@ encoding.  This attribute has a meaning only when `:coding-type' is
 				   ((eq coding-type 'ccl)
 				    '(:ccl-decoder
 				      :ccl-encoder
-				      :valids))))))
+				      :valids))
+				   ((eq coding-type 'undecided)
+				    '(:inhibit-null-byte-detection
+				      :inhibit-iso-escape-detection
+				      :prefer-utf-8))))))
 
     (dolist (slot common-attrs)
       (setcdr slot (plist-get props (car slot))))
@@ -1236,7 +1335,9 @@ just set the variable `buffer-file-coding-system' directly."
   (if (and coding-system buffer-file-coding-system (null force))
       (setq coding-system
 	    (merge-coding-systems coding-system buffer-file-coding-system)))
-  (when (called-interactively-p 'interactive)
+  (when (and (called-interactively-p 'interactive)
+	     (not (memq 'emacs (coding-system-get coding-system
+						  :charset-list))))
     ;; Check whether save would succeed, and jump to the offending char(s)
     ;; if not.
     (let ((css (find-coding-systems-region (point-min) (point-max))))
@@ -1313,7 +1414,7 @@ graphical terminals."
   (if coding-system
       (setq default-terminal-coding-system coding-system))
   (set-terminal-coding-system-internal coding-system terminal)
-  (redraw-frame (selected-frame)))
+  (redraw-frame))
 
 (defvar default-keyboard-coding-system nil
   "Default value of the keyboard coding system.
@@ -1396,7 +1497,7 @@ use either \\[customize] or \\[set-keyboard-coding-system]."
   :type '(coding-system :tag "Coding system")
   :link '(info-link "(emacs)Terminal Coding")
   :link '(info-link "(emacs)Unibyte Mode")
-  :set (lambda (symbol value)
+  :set (lambda (_symbol value)
 	 ;; Don't load encoded-kb unnecessarily.
 	 (if (or value (boundp 'encoded-kbd-setup-display))
 	     (set-keyboard-coding-system value)
@@ -1691,7 +1792,7 @@ ARC\\|ZIP\\|LZH\\|LHA\\|ZOO\\|[JEW]AR\\|XPI\\|RAR\\|7Z\\)\\'"
     ("\\.\\(gz\\|Z\\|bz\\|bz2\\|xz\\|gpg\\)\\'" . no-conversion)
     ("\\.\\(jpe?g\\|png\\|gif\\|tiff?\\|p[bpgn]m\\)\\'" . no-conversion)
     ("\\.pdf\\'" . no-conversion)
-    ("/#[^/]+#\\'" . emacs-mule)))
+    ("/#[^/]+#\\'" . utf-8-emacs-unix)))
   "Alist of filename patterns vs corresponding coding systems.
 Each element looks like (REGEXP . CODING-SYSTEM).
 A file whose name matches REGEXP is decoded by CODING-SYSTEM on reading.
@@ -1817,7 +1918,7 @@ If nothing is specified, the return value is nil."
 	     (head-end (+ head-start (min size 1024)))
 	     (tail-start (+ head-start (max (- size 3072) 0)))
 	     (tail-end (+ head-start size))
-	     coding-system head-found tail-found pos char-trans)
+	     coding-system head-found tail-found char-trans)
 	;; Try a short cut by searching for the string "coding:"
 	;; and for "unibyte:" at the head and tail of SIZE bytes.
 	(setq head-found (or (search-forward "coding:" head-end t)
@@ -1844,8 +1945,11 @@ If nothing is specified, the return value is nil."
 		       (re-search-forward
 			"\\(.*;\\)?[ \t]*unibyte:[ \t]*\\([^ ;]+\\)"
 			head-end t))
-              (display-warning 'mule "`unibyte: t' is obsolete; \
-use \"coding: 'raw-text\" instead." :warning)
+              (display-warning 'mule
+                               (format "\"unibyte: t\" (in %s) is obsolete; \
+use \"coding: 'raw-text\" instead."
+                                       (file-relative-name filename))
+                               :warning)
 	      (setq coding-system 'raw-text))
 	    (when (and (not coding-system)
 		       (re-search-forward
@@ -1924,11 +2028,10 @@ use \"coding: 'raw-text\" instead." :warning)
       (let ((funcs auto-coding-functions)
 	    (coding-system nil))
 	(while (and funcs (not coding-system))
-	  (setq coding-system (condition-case e
-				  (save-excursion
-				    (goto-char (point-min))
-				    (funcall (pop funcs) size))
-				(error nil))))
+	  (setq coding-system (ignore-errors
+				(save-excursion
+				  (goto-char (point-min))
+				  (funcall (pop funcs) size)))))
 	(if coding-system
 	    (cons coding-system 'auto-coding-functions)))))
 
@@ -2214,7 +2317,13 @@ ALIST is an alist, each element has the form (FROM . TO).
 FROM and TO are a character or a vector of characters.
 If FROM is a character, that character is translated to TO.
 If FROM is a vector of characters, that sequence is translated to TO.
-The first extra-slot of the value is a translation table for reverse mapping."
+The first extra-slot of the value is a translation table for reverse mapping.
+
+FROM and TO may be nil.  If TO is nil, the translation from FROM
+to nothing is defined in the translation table and that element
+is ignored in the reverse map.  If FROM is nil, the translation
+from TO to nothing is defined in the reverse map only.  A vector
+of length zero has the same meaning as specifying nil."
   (let ((tables (vector (make-char-table 'translation-table)
 			(make-char-table 'translation-table)))
 	table max-lookup from to idx val)
@@ -2227,20 +2336,23 @@ The first extra-slot of the value is a translation table for reverse mapping."
 	  (setq from (cdr elt) to (car elt)))
 	(if (characterp from)
 	    (setq idx from)
-	  (setq idx (aref from 0)
-		max-lookup (max max-lookup (length from))))
-	(setq val (aref table idx))
-	(if val
-	    (progn
-	      (or (consp val)
-		  (setq val (list (cons (vector idx) val))))
-	      (if (characterp from)
-		  (setq from (vector from)))
-	      (setq val (nconc val (list (cons from to)))))
-	  (if (characterp from)
-	      (setq val to)
-	    (setq val (list (cons from to)))))
-	(aset table idx val))
+	  (if (= (length from) 0)
+	      (setq idx nil)
+	    (setq idx (aref from 0)
+		  max-lookup (max max-lookup (length from)))))
+	(when idx
+	  (setq val (aref table idx))
+	  (if val
+	      (progn
+		(or (consp val)
+		    (setq val (list (cons (vector idx) val))))
+		(if (characterp from)
+		    (setq from (vector from)))
+		(setq val (nconc val (list (cons from to)))))
+	    (if (characterp from)
+		(setq val to)
+	      (setq val (list (cons from to)))))
+	  (aset table idx val)))
       (set-char-table-extra-slot table 1 max-lookup))
     (set-char-table-extra-slot (aref tables 0) 0 (aref tables 1))
     (aref tables 0)))