X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/1abfd3e85fa9b340699430cd9e15dd9f0073bdbe..8dd58a2d1fedaa16573bc67e986dc2014620c681:/lisp/international/mule.el diff --git a/lisp/international/mule.el b/lisp/international/mule.el index 47c7087518..181474c65c 100644 --- a/lisp/international/mule.el +++ b/lisp/international/mule.el @@ -1,6 +1,6 @@ ;;; mule.el --- basic commands for multilingual environment -;; Copyright (C) 1997-2013 Free Software Foundation, Inc. +;; Copyright (C) 1997-2015 Free Software Foundation, Inc. ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011 ;; National Institute of Advanced Industrial Science and Technology (AIST) @@ -407,12 +407,12 @@ PLIST (property list) may contain any type of information a user ;; because that makes a bootstrapping problem ;; if you need to recompile all the Lisp files using interpreted code. -(defun charset-id (charset) +(defun charset-id (_charset) "Always return 0. This is provided for backward compatibility." (declare (obsolete nil "23.1")) 0) -(defmacro charset-bytes (charset) +(defmacro charset-bytes (_charset) "Always return 0. This is provided for backward compatibility." (declare (obsolete nil "23.1")) 0) @@ -471,7 +471,7 @@ Return -1 if charset isn't an ISO 2022 one." ;;; CHARACTER (define-obsolete-function-alias 'char-valid-p 'characterp "23.1") -(defun generic-char-p (char) +(defun generic-char-p (_char) "Always return nil. This is provided for backward compatibility." (declare (obsolete nil "23.1")) nil) @@ -518,7 +518,8 @@ Return -1 if charset isn't an ISO 2022 one." composition euc-tw-shift use-roman - use-oldjis) + use-oldjis + 8-bit-level-4) "List of symbols that control ISO-2022 encoder/decoder. 
The value of the `:flags' attribute in the argument of the function @@ -542,8 +543,9 @@ If `locking-shift' is specified, decode locking-shift code correctly on decoding, and use locking-shift to invoke a graphic element on encoding. -If `single-shift' is specified, decode single-shift code correctly on -decoding, and use single-shift to invoke a graphic element on encoding. +If `single-shift' is specified, decode single-shift code +correctly on decoding, and use single-shift to invoke a graphic +element on encoding. See also `8-bit-level-4' specification. If `designation' is specified, decode designation code correctly on decoding, and use designation to designate a charset to a graphic @@ -578,13 +580,42 @@ If `use-roman' is specified, JIS0201-1976-Roman is designated instead of ASCII. If `use-oldjis' is specified, JIS0208-1976 is designated instead of -JIS0208-1983.") +JIS0208-1983. + +If `8-bit-level-4' is specified, the decoder assumes the +implementation level \"4\" for 8-bit codes which means that GL is +identified as the single-shift area. The default implementation +level for 8-bit code is \"4A\" which means that GR is identified +as the single-shift area.") (defun define-coding-system (name docstring &rest props) "Define NAME (a symbol) as a coding system with DOCSTRING and attributes. The remaining arguments must come in pairs ATTRIBUTE VALUE. ATTRIBUTE may be any symbol. +A coding system specifies a rule to decode (i.e. to convert a +byte sequence to a character sequence) and a rule to encode (the +opposite of decoding). + +The decoding is done by at most 3 steps; the first is to convert +a byte sequence to a character sequence by one of Emacs' +internal routines specified by `:coding-type' attribute. The +optional second step is to convert the character sequence (the +result of the first step) by a translation table specified +by `:decode-translation-table' attribute. 
The optional third step +is to convert the above result by a Lisp function specified +by `:post-read-conversion' attribute. + +The encoding is done by at most 3 steps, which are the reverse +of the decoding steps. The optional first step converts a +character sequence to another character sequence by a Lisp +function specified by `:pre-write-conversion' attribute. The +optional second step converts the above result by a translation +table specified by `:encode-translation-table' attribute. The +third step converts the above result to a byte sequence by one +of Emacs's internal routines specified by the `:coding-type' +attribute. + The following attributes have special meanings. Those labeled as \"(required)\" should not be omitted. @@ -594,27 +625,72 @@ VALUE is a character to display on mode line for the coding system. `:coding-type' (required) -VALUE must be one of `charset', `utf-8', `utf-16', `iso-2022', -`emacs-mule', `shift-jis', `ccl', `raw-text', `undecided'. +VALUE specifies the format of byte sequence the coding system +decodes and encodes to. It must be one of `charset', `utf-8', +`utf-16', `iso-2022', `emacs-mule', `shift-jis', `ccl', +`raw-text', `undecided'. + +If VALUE is `charset', the coding system is for handling a +byte sequence in which each byte or every two- to four-byte +sequence represents a character code of a charset specified +by the `:charset-list' attribute. + +If VALUE is `utf-8', the coding system is for handling Unicode +UTF-8 byte sequences. See also the documentation of the +attribute `:bom'. + +If VALUE is `utf-16', the coding system is for handling Unicode +UTF-16 byte sequences. See also the documentation of the +attributes `:bom' and `:endian'. + +If VALUE is `iso-2022', the coding system is for handling byte +sequences conforming to ISO/IEC 2022. See also the documentation +of the attributes `:charset-list', `:flags', and `:designation'. 
+ +If VALUE is `emacs-mule', the coding system is for handling +byte sequences which Emacs 20 and 21 used for their internal +representation of characters. + +If VALUE is `shift-jis', the coding system is for handling byte +sequences of Shift_JIS format. See also the attribute `:charset-list'. + +If VALUE is `ccl', the coding system uses CCL programs to decode +and encode byte sequences. The CCL programs must be +specified by the attributes `:ccl-decoder' and `:ccl-encoder'. + +If VALUE is `raw-text', the coding system decodes byte sequences +without any conversions. `:eol-type' VALUE is the EOL (end-of-line) format of the coding system. It must be one of `unix', `dos', `mac'. The symbol `unix' means Unix-like EOL -\(i.e. single LF), `dos' means DOS-like EOL \(i.e. sequence of CR LF), -and `mac' means Mac-like EOL \(i.e. single CR). If omitted, Emacs -detects the EOL format automatically when decoding. +\(i.e. a single LF character), `dos' means DOS-like EOL \(i.e. a sequence +of CR followed by LF), and `mac' means Mac-like EOL \(i.e. a single CR). +If omitted, Emacs detects the EOL format automatically when decoding. + +`:charset-list' (required if `:coding-type' is `charset' or `shift-jis') + +VALUE must be a list of charsets supported by the coding system. -`:charset-list' +If `:coding-type' is `charset', then on decoding and encoding by the +coding system, if a character belongs to multiple charsets in the +list, a charset that comes first in the list is selected. -VALUE must be a list of charsets supported by the coding system. On -encoding by the coding system, if a character belongs to multiple -charsets in the list, a charset that comes earlier in the list is -selected. If `:coding-type' is `iso-2022', VALUE may be `iso-2022', -which indicates that the coding system supports all ISO-2022 based -charsets. 
If `:coding-type' is `emacs-mule', VALUE may be -`emacs-mule', which indicates that the coding system supports all -charsets that have the `:emacs-mule-id' property. +If `:coding-type' is `iso-2022', VALUE may be `iso-2022', which +indicates that the coding system supports all ISO-2022 based +charsets. + +If `:coding-type' is `shift-jis', VALUE must be a list of three +to four charsets supported by Shift_JIS encoding scheme. The +first charset (one dimension) is for code space 0x00..0x7F, the +second (one dimension) for 0xA1..0xDF, the third (two dimension) +for 0x8140..0xEFFC, the optional fourth (three dimension) for +0xF040..0xFCFC. + +If `:coding-type' is `emacs-mule', VALUE may be `emacs-mule', +which indicates that the coding system supports all charsets that +have the `:emacs-mule-id' property. `:ascii-compatible-p' @@ -635,9 +711,9 @@ VALUE must be a translation table to use on encoding. VALUE must be a function to call after some text is inserted and decoded by the coding system itself and before any functions in `after-insert-functions' are called. This function is passed one -argument; the number of characters in the text to convert, with +argument: the number of characters in the text to convert, with point at the start of the text. The function should leave point -the same, and return the new character count. +unchanged, and should return the new character count. `:pre-write-conversion' @@ -666,13 +742,13 @@ to lower case. `:mime-text-unsuitable' VALUE non-nil means the `:mime-charset' property names a charset which -is unsuitable for the top-level media type \"text\". +is unsuitable for the top-level media of type \"text\". `:flags' VALUE must be a list of symbols that control the ISO-2022 converter. Each must be a member of the list `coding-system-iso-2022-flags' -\(which see). This attribute has a meaning only when `:coding-type' +\(which see). This attribute is meaningful only when `:coding-type' is `iso-2022'. 
`:designation' @@ -692,12 +768,12 @@ to GN. If the list contains 96, any charsets whose whose ranges are 96 long can be designated to GN. If the first element is a charset, that charset is initially designated to GN. -This attribute has a meaning only when `:coding-type' is `iso-2022'. +This attribute is meaningful only when `:coding-type' is `iso-2022'. `:bom' -This attributes specifies whether the coding system uses a `byte order -mark'. VALUE must be nil, t, or cons of coding systems whose +This attribute specifies whether the coding system uses a \"byte order +mark\". VALUE must be nil, t, or a cons cell of coding systems whose `:coding-type' is `utf-16' or `utf-8'. If the value is nil, on decoding, don't treat the first two-byte as @@ -706,13 +782,13 @@ BOM, and on encoding, don't produce BOM bytes. If the value is t, on decoding, skip the first two-byte as BOM, and on encoding, produce BOM bytes according to the value of `:endian'. -If the value is cons, on decoding, check the first two-byte. If they -are 0xFE 0xFF, use the car part coding system of the value. If they -are 0xFF 0xFE, use the cdr part coding system of the value. +If the value is a cons cell, on decoding, check the first two bytes. +If they are 0xFE 0xFF, use the car part coding system of the value. +If they are 0xFF 0xFE, use the cdr part coding system of the value. Otherwise, treat them as bytes for a normal character. On encoding, produce BOM bytes according to the value of `:endian'. -This attribute has a meaning only when `:coding-type' is `utf-16' or +This attribute is meaningful only when `:coding-type' is `utf-16' or `utf-8'. `:endian' @@ -720,19 +796,38 @@ This attribute has a meaning only when `:coding-type' is `utf-16' or VALUE must be `big' or `little' specifying big-endian and little-endian respectively. The default value is `big'. -This attribute has a meaning only when `:coding-type' is `utf-16'. +This attribute is meaningful only when `:coding-type' is `utf-16'. 
+ +`:ccl-decoder' (required if `:coding-type' is `ccl') + +VALUE is a CCL program name defined by `define-ccl-program'. The +CCL program reads a byte sequence and writes a character sequence +as a decoding result. + +`:ccl-encoder' (required if `:coding-type' is `ccl') + +VALUE is a CCL program name defined by `define-ccl-program'. The +CCL program reads a character sequence and writes a byte sequence +as an encoding result. -`:ccl-decoder' +`:inhibit-null-byte-detection' -VALUE is a symbol representing the registered CCL program used for -decoding. This attribute has a meaning only when `:coding-type' is -`ccl'. +VALUE non-nil means Emacs ignores null bytes on code detection. +See the variable `inhibit-null-byte-detection'. This attribute +is meaningful only when `:coding-type' is `undecided'. -`:ccl-encoder' +`:inhibit-iso-escape-detection' -VALUE is a symbol representing the registered CCL program used for -encoding. This attribute has a meaning only when `:coding-type' is -`ccl'." +VALUE non-nil means Emacs ignores ISO-2022 escape sequences on +code detection. See the variable `inhibit-iso-escape-detection'. +This attribute is meaningful only when `:coding-type' is +`undecided'. + +`:prefer-utf-8' + +VALUE non-nil means Emacs prefers UTF-8 on code detection for +non-ASCII files. This attribute is meaningful only when +`:coding-type' is `undecided'." (let* ((common-attrs (mapcar 'list '(:mnemonic :coding-type @@ -761,7 +856,11 @@ encoding. This attribute has a meaning only when `:coding-type' is ((eq coding-type 'ccl) '(:ccl-decoder :ccl-encoder - :valids)))))) + :valids)) + ((eq coding-type 'undecided) + '(:inhibit-null-byte-detection + :inhibit-iso-escape-detection + :prefer-utf-8)))))) (dolist (slot common-attrs) (setcdr slot (plist-get props (car slot)))) @@ -1236,7 +1335,9 @@ just set the variable `buffer-file-coding-system' directly." 
(if (and coding-system buffer-file-coding-system (null force)) (setq coding-system (merge-coding-systems coding-system buffer-file-coding-system))) - (when (called-interactively-p 'interactive) + (when (and (called-interactively-p 'interactive) + (not (memq 'emacs (coding-system-get coding-system + :charset-list)))) ;; Check whether save would succeed, and jump to the offending char(s) ;; if not. (let ((css (find-coding-systems-region (point-min) (point-max)))) @@ -1313,7 +1414,7 @@ graphical terminals." (if coding-system (setq default-terminal-coding-system coding-system)) (set-terminal-coding-system-internal coding-system terminal) - (redraw-frame (selected-frame))) + (redraw-frame)) (defvar default-keyboard-coding-system nil "Default value of the keyboard coding system. @@ -1396,7 +1497,7 @@ use either \\[customize] or \\[set-keyboard-coding-system]." :type '(coding-system :tag "Coding system") :link '(info-link "(emacs)Terminal Coding") :link '(info-link "(emacs)Unibyte Mode") - :set (lambda (symbol value) + :set (lambda (_symbol value) ;; Don't load encoded-kb unnecessarily. (if (or value (boundp 'encoded-kbd-setup-display)) (set-keyboard-coding-system value) @@ -1691,7 +1792,7 @@ ARC\\|ZIP\\|LZH\\|LHA\\|ZOO\\|[JEW]AR\\|XPI\\|RAR\\|7Z\\)\\'" ("\\.\\(gz\\|Z\\|bz\\|bz2\\|xz\\|gpg\\)\\'" . no-conversion) ("\\.\\(jpe?g\\|png\\|gif\\|tiff?\\|p[bpgn]m\\)\\'" . no-conversion) ("\\.pdf\\'" . no-conversion) - ("/#[^/]+#\\'" . emacs-mule))) + ("/#[^/]+#\\'" . utf-8-emacs-unix))) "Alist of filename patterns vs corresponding coding systems. Each element looks like (REGEXP . CODING-SYSTEM). A file whose name matches REGEXP is decoded by CODING-SYSTEM on reading. @@ -1817,7 +1918,7 @@ If nothing is specified, the return value is nil." 
(head-end (+ head-start (min size 1024))) (tail-start (+ head-start (max (- size 3072) 0))) (tail-end (+ head-start size)) - coding-system head-found tail-found pos char-trans) + coding-system head-found tail-found char-trans) ;; Try a short cut by searching for the string "coding:" ;; and for "unibyte:" at the head and tail of SIZE bytes. (setq head-found (or (search-forward "coding:" head-end t) @@ -1844,8 +1945,11 @@ If nothing is specified, the return value is nil." (re-search-forward "\\(.*;\\)?[ \t]*unibyte:[ \t]*\\([^ ;]+\\)" head-end t)) - (display-warning 'mule "`unibyte: t' is obsolete; \ -use \"coding: 'raw-text\" instead." :warning) + (display-warning 'mule + (format "\"unibyte: t\" (in %s) is obsolete; \ +use \"coding: 'raw-text\" instead." + (file-relative-name filename)) + :warning) (setq coding-system 'raw-text)) (when (and (not coding-system) (re-search-forward @@ -1924,11 +2028,10 @@ use \"coding: 'raw-text\" instead." :warning) (let ((funcs auto-coding-functions) (coding-system nil)) (while (and funcs (not coding-system)) - (setq coding-system (condition-case e - (save-excursion - (goto-char (point-min)) - (funcall (pop funcs) size)) - (error nil)))) + (setq coding-system (ignore-errors + (save-excursion + (goto-char (point-min)) + (funcall (pop funcs) size))))) (if coding-system (cons coding-system 'auto-coding-functions))))) @@ -2214,7 +2317,13 @@ ALIST is an alist, each element has the form (FROM . TO). FROM and TO are a character or a vector of characters. If FROM is a character, that character is translated to TO. If FROM is a vector of characters, that sequence is translated to TO. -The first extra-slot of the value is a translation table for reverse mapping." +The first extra-slot of the value is a translation table for reverse mapping. + +FROM and TO may be nil. If TO is nil, the translation from FROM +to nothing is defined in the translation table and that element +is ignored in the reverse map. 
If FROM is nil, the translation +from TO to nothing is defined in the reverse map only. A vector +of length zero has the same meaning as specifying nil." (let ((tables (vector (make-char-table 'translation-table) (make-char-table 'translation-table))) table max-lookup from to idx val) @@ -2227,20 +2336,23 @@ The first extra-slot of the value is a translation table for reverse mapping." (setq from (cdr elt) to (car elt))) (if (characterp from) (setq idx from) - (setq idx (aref from 0) - max-lookup (max max-lookup (length from)))) - (setq val (aref table idx)) - (if val - (progn - (or (consp val) - (setq val (list (cons (vector idx) val)))) - (if (characterp from) - (setq from (vector from))) - (setq val (nconc val (list (cons from to))))) - (if (characterp from) - (setq val to) - (setq val (list (cons from to))))) - (aset table idx val)) + (if (= (length from) 0) + (setq idx nil) + (setq idx (aref from 0) + max-lookup (max max-lookup (length from))))) + (when idx + (setq val (aref table idx)) + (if val + (progn + (or (consp val) + (setq val (list (cons (vector idx) val)))) + (if (characterp from) + (setq from (vector from))) + (setq val (nconc val (list (cons from to))))) + (if (characterp from) + (setq val to) + (setq val (list (cons from to))))) + (aset table idx val))) (set-char-table-extra-slot table 1 max-lookup)) (set-char-table-extra-slot (aref tables 0) 0 (aref tables 1)) (aref tables 0)))