Modify the coding system compound-text-with-extensions to conform to the spec of...

author Kenichi Handa <handa@etlken>

Wed, 4 Aug 2010 08:06:52 +0000 (17:06 +0900)

committer Kenichi Handa <handa@etlken>

Wed, 4 Aug 2010 08:06:52 +0000 (17:06 +0900)
author Kenichi Handa <handa@etlken>
Wed, 4 Aug 2010 08:06:52 +0000 (17:06 +0900)
committer Kenichi Handa <handa@etlken>
Wed, 4 Aug 2010 08:06:52 +0000 (17:06 +0900)
diff --git a/lisp/ChangeLog b/lisp/ChangeLog

index 670f07c26834cf691e2f4b71fde346cb998e671d..775ddcdc2e7095fc3400e9c8735a185414ae615d 100644 (file)
--- a/lisp/ChangeLog
+++ b/lisp/ChangeLog
@@ -1,3 +1,20 @@
+2010-08-04  Kenichi Handa  <handa@m17n.org>
+
+       * language/cyrillic.el: Don't add "microsoft-cp1251" to
+       ctext-non-standard-encodings-alist here.
+
+       * international/mule.el (ctext-non-standard-encodings-alist): Add
+       "koi8-r" and "microsoft-cp1251".
+       (ctext-standard-encodings): New variable.
+       (ctext-non-standard-encodings-table): List only elements for
+       non-standard encodings.
+       (ctext-pre-write-conversion): Adjusted for the above change.
+       Check ctext-standard-encodings.
+
+       * international/mule-conf.el (compound-text): Doc fix.
+       (ctext-no-compositions): Doc fix.
+       (compound-text-with-extensions): Doc fix.
+
  2010-07-23  Juanma Barranquero  <lekktu@gmail.com>
  
         * help-fns.el (find-lisp-object-file-name): Doc fix (bug#6494).
diff --git a/lisp/international/mule-conf.el b/lisp/international/mule-conf.el

index f53b69eed8bb17cdfc6613e0b7d0ce89359d14ea..9ee8d22463a0479f27e8df27ef2ca5e636acacbe 100644 (file)
--- a/lisp/international/mule-conf.el
+++ b/lisp/international/mule-conf.el
@@ -1410,9 +1410,10 @@ is treated as a character."
    :flags '(ascii-at-eol ascii-at-cntl designation single-shift composition))
  
  (define-coding-system 'compound-text
-  "Compound text based generic encoding for decoding unknown messages.
-
-This coding system does not support extended segments of CTEXT."
+  "Compound text based generic encoding.
+This coding system is an extension of X's \"Compound Text Encoding\".
+It encodes many characters using the normal ISO-2022 designation sequences,
+but it doesn't support extended segments of CTEXT."
    :coding-type 'iso-2022
    :mnemonic ?x
    :charset-list 'iso-2022
@@ -1432,7 +1433,7 @@ This coding system does not support extended segments of CTEXT."
  ;; not have a mime-charset property, to prevent it from showing up
  ;; close to the beginning of coding systems ordered by priority.
  (define-coding-system 'ctext-no-compositions
- "Compound text based generic encoding for decoding unknown messages.
+ "Compound text based generic encoding.
  
  Like `compound-text', but does not produce escape sequences for compositions."
    :coding-type 'iso-2022
@@ -1445,8 +1446,9 @@ Like `compound-text', but does not produce escape sequences for compositions."
  (define-coding-system 'compound-text-with-extensions
   "Compound text encoding with ICCCM Extended Segment extensions.
  
-See the variable `ctext-non-standard-encodings-alist' for the
-detail about how extended segments are handled.
+See the variables `ctext-standard-encodings' and
+`ctext-non-standard-encodings-alist' for the detail about how
+extended segments are handled.
  
  This coding system should be used only for X selections.  It is inappropriate
  for decoding and encoding files, process I/O, etc."
diff --git a/lisp/international/mule.el b/lisp/international/mule.el

index 7e7e55728c8d91f912508898e0c7567bf4e7d076..e030acbef0209b121e62b3f707d351b88b959ad0 100644 (file)
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@@ -1408,7 +1408,9 @@ This function is provided for backward compatibility."
    '(("big5-0" big5 2 big5)
      ("ISO8859-14" iso-8859-14 1 latin-iso8859-14)
      ("ISO8859-15" iso-8859-15 1 latin-iso8859-15)
-    ("gbk-0" gbk 2 chinese-gbk)))
+    ("gbk-0" gbk 2 chinese-gbk)
+    ("koi8-r" koi8-r 1 koi8-r)
+    ("microsoft-cp1251" windows-1251 1 windows-1251)))
    "Alist of non-standard encoding names vs the corresponding usages in CTEXT.
  
  It controls how extended segments of a compound text are handled
@@ -1497,6 +1499,20 @@ Each element must be one of the names listed in the variable
        (goto-char (point-min))
        (- (point-max) (point)))))
  
+(defvar ctext-standard-encodings
+  '(ascii latin-jisx0201 katakana-jisx0201
+         latin-iso8859-1 latin-iso8859-2 latin-iso8859-3 latin-iso8859-4
+         greek-iso8859-7 arabic-iso8859-6 hebrew-iso8859-8 cyrillic-iso8859-5
+         latin-iso8859-9
+         chinese-gb2312 japanese-jisx0208 korean-ksc5601)
+  "List of approved standard encodings (i.e. charsets) of X's Compound Text.
+Coding-system `compound-text-with-extensions' encodes a character
+belonging to any of those charsets using the normal ISO2022
+designation sequence unless the current language environment or
+the variable `ctext-non-standard-encodings' decide to use an extended
+segment of CTEXT for that character.  See also the documentation
+of `ctext-non-standard-encodings-alist'.")
+
  ;; Return an alist of CHARSET vs CTEXT-USAGE-INFO generated from
  ;; `ctext-non-standard-encodings' and a list specified by the key
  ;; `ctext-non-standard-encodings' for the currrent language
@@ -1508,77 +1524,74 @@ Each element must be one of the names listed in the variable
  ;; is encoded using UTF-8 encoding extention.
  
  (defun ctext-non-standard-encodings-table ()
-  (let (table)
-    ;; Setup charsets specified by the key
-    ;; `ctext-non-standard-encodings' for the current language
-    ;; environment and in `ctext-non-standard-encodings'.
-    (dolist (encoding (append
-                       (get-language-info current-language-environment
-                                          'ctext-non-standard-encodings)
-                       ctext-non-standard-encodings))
-      (let* ((slot (assoc encoding ctext-non-standard-encodings-alist))
+  (let* ((table (append ctext-non-standard-encodings
+                       (copy-sequence
+                        (get-language-info current-language-environment
+                                           'ctext-non-standard-encodings))))
+        (tail table)
+        elt)
+    (while tail
+      (setq elt (car tail))
+      (let* ((slot (assoc elt ctext-non-standard-encodings-alist))
              (charset (nth 3 slot)))
         (if (charsetp charset)
-           (push (cons charset slot) table)
-         (dolist (cs charset)
-           (push (cons cs slot) table)))))
-
-    ;; Next prepend charsets for ISO2022 designation sequence.
-    (dolist (charset charset-list)
-      (let ((final (plist-get (charset-plist charset) :iso-final-char)))
-       (if (and (integerp final)
-                (>= final #x40) (<= final #x7e)
-                ;; Exclude ascii and chinese-cns11643-X.
-                (not (eq charset 'ascii))
-                (not (string-match "cns11643" (symbol-name charset))))
-           (push (cons charset nil) table))))
-
-    ;; Returned reversed list so that the charsets specified by the
-    ;; key `ctext-non-standard-encodings' for the current language
-    ;; have the highest priority.
-    (nreverse table)))
+           (setcar tail (cons charset slot))
+         (setcar tail (cons (car charset) slot))
+         (dolist (cs (cdr charset))
+           (setcdr tail
+                   (cons (cons (car cs) slot) (cdr tail)))
+           (setq tail (cdr tail))))
+       (setq tail (cdr tail))))
+    table))
  
  (defun ctext-pre-write-conversion (from to)
    "Encode characters between FROM and TO as Compound Text w/Extended Segments.
  
-If FROM is a string, or if the current buffer is not the one set up for us
-by `encode-coding-string', generate a new temp buffer, insert the text,
-and convert it in the temporary buffer.  Otherwise, convert in-place."
+If FROM is a string, generate a new temp buffer, insert the text,
+and convert it in the temporary buffer.  Otherwise, convert
+in-place."
    (save-match-data
      ;; Setup a working buffer if necessary.
      (when (stringp from)
        (set-buffer (generate-new-buffer " *temp"))
        (set-buffer-multibyte (multibyte-string-p from))
-      (insert from))
-
-    ;; Now we can encode the whole buffer.
-    (let ((encoding-table (ctext-non-standard-encodings-table))
-         last-coding-system-used
-         last-pos last-encoding-info
-         encoding-info end-pos ch)
-      (goto-char (setq last-pos (point-min)))
-      (setq end-pos (point-marker))
-      (while (re-search-forward "[^\000-\177]+" nil t)
-       ;; Found a sequence of non-ASCII characters.
-       (setq last-pos (match-beginning 0)
-             ch (char-after last-pos)
-             last-encoding-info (catch 'tag
-                                  (dolist (elt encoding-table)
-                                    (if (encode-char ch (car elt))
-                                        (throw 'tag (cdr elt))))
-                                  'utf-8))
-       (set-marker end-pos (match-end 0))
-       (goto-char (1+ last-pos))
-       (catch 'tag
-         (while t
-           (setq encoding-info
-                 (if (< (point) end-pos)
-                     (catch 'tag
-                       (setq ch (following-char))
-                       (dolist (elt encoding-table)
-                         (if (encode-char ch (car elt))
-                             (throw 'tag (cdr elt))))
-                       'utf-8)))
+      (insert from)
+      (setq from 1 to (point-max)))
+    (save-restriction
+      (narrow-to-region from to)
+      (let ((encoding-table (ctext-non-standard-encodings-table))
+           (charset-list ctext-standard-encodings)
+           last-coding-system-used
+           last-pos last-encoding-info
+           encoding-info end-pos ch charset)
+       (dolist (elt encoding-table)
+         (push (car elt) charset-list))
+       (goto-char (setq last-pos from))
+       (setq end-pos (point-marker))
+       (while (re-search-forward "[^\000-\177]+" nil t)
+         ;; Found a sequence of non-ASCII characters.
+         (setq last-pos (match-beginning 0)
+               ch (char-after last-pos)
+               charset (char-charset ch charset-list)
+               last-encoding-info
+               (if charset
+                   (or (cdr (assq charset encoding-table))
+                       charset)
+                 'utf-8))
+         (set-marker end-pos (match-end 0))
+         (goto-char (1+ last-pos))
+         (while (marker-position end-pos)
+           (if (< (point) end-pos)
+               (progn
+                 (setq charset (char-charset (following-char) charset-list)
+                       encoding-info
+                       (if charset
+                           (or (cdr (assq charset encoding-table))
+                               charset)
+                         'utf-8))
+                 (forward-char 1))
+             (setq encoding-info nil)
+             (set-marker end-pos nil))
             (unless (eq last-encoding-info encoding-info)
               (cond ((consp last-encoding-info)
                      ;; Encode the previous range using an extended
@@ -1609,14 +1622,12 @@ and convert it in the temporary buffer.  Otherwise, convert in-place."
                      (save-excursion
                        (goto-char last-pos)
                        (insert "\e%G"))
-                    (insert "\e%@")))
+                    (insert "\e%@"))
+                   (t
+                    (put-text-property last-pos (point) 'charset charset)))
               (setq last-pos (point)
-                   last-encoding-info encoding-info))
-           (if (< (point) end-pos)
-               (forward-char 1)
-             (throw 'tag nil)))))
-      (set-marker end-pos nil)
-      (goto-char (point-min))))
+                   last-encoding-info encoding-info))))
+       (goto-char (point-min)))))
    ;; Must return nil, as build_annotations_2 expects that.
    nil)
  
diff --git a/lisp/language/cyrillic.el b/lisp/language/cyrillic.el

index 7d2f082579f1054165a505307192b22abd7c8a0e..b293ad1ff0b70def122a42cd69b568cdda8d37c0 100644 (file)
--- a/lisp/language/cyrillic.el
+++ b/lisp/language/cyrillic.el
@@ -239,13 +239,6 @@ Support for Russian using koi8-r and the russian-computer input method.")
            (documentation . "Support for Tajik using KOI8-T."))
   '("Cyrillic"))
  
-(let ((elt `("microsoft-cp1251" windows-1251 1
-            ,(get 'encode-windows-1251 'translation-table)))
-      (slot (assoc "microsoft-cp1251" ctext-non-standard-encodings-alist)))
-  (if slot
-      (setcdr slot (cdr elt))
-    (push elt ctext-non-standard-encodings-alist)))
-
  (set-language-info-alist
   "Bulgarian" `((coding-system windows-1251)
                (coding-priority windows-1251)
author	Kenichi Handa <handa@etlken>
	Wed, 4 Aug 2010 08:06:52 +0000 (17:06 +0900)
committer	Kenichi Handa <handa@etlken>
	Wed, 4 Aug 2010 08:06:52 +0000 (17:06 +0900)
lisp/ChangeLog		patch \| blob \| history
lisp/international/mule-conf.el		patch \| blob \| history
lisp/international/mule.el		patch \| blob \| history
lisp/language/cyrillic.el		patch \| blob \| history