1 ;;; code-pages.el --- coding systems for assorted codepages -*-coding: utf-8;-*-
3 ;; Copyright (C) 2001 Free Software Foundation, Inc.
5 ;; Author: Dave Love <fx@gnu.org>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
27 ;; Definitions of miscellaneous 8-bit coding systems based on ASCII
28 ;; (we can't cope properly with EBCDIC, for instance), mainly for PC
29 ;; `code pages'. They are decoded into Latin-1 and mule-unicode
30 ;; charsets rather than (lossily) into single iso8859 charsets à la
31 ;; codepage.el. The utility `cp-make-coding-system' derives them from
34 ;; Those covered are: cp437, cp737, cp720, cp775, cp850, cp851, cp852,
35 ;; cp855, cp857, cp860, cp861, cp862, cp863, cp864, cp865, cp866,
36 ;; cp869, cp874, cp1125, windows-1250, windows-1251, windows-1252,
37 ;; windows-1253, windows-1254, windows-1255, windows-1256,
38 ;; windows-1257, windows-1258, next, koi8-r, koi8-u, iso-8859-6,
39 ;; iso-8859-10, iso-8859-13, iso-8859-16, koi8-t, georgian-ps. This
40 ;; is meant to include all the single-byte ones relevant to GNU (used
41 ;; in glibc-defined locales); we don't yet get all the multibyte ones
44 ;; Note that koi8-r and cp866 (alternativnyj) clash with the
45 ;; iso8859-5-based versions in cyrillic.el (which should be changed),
46 ;; and others can clash with definitions in codepage.el; we try to
47 ;; avoid damage from that. A few CPs from codepage.el (770, 773, 774)
48 ;; aren't covered (in the absence of translation tables to Unicode).
50 ;; Compile this, to avoid loading `ccl' at runtime.
52 ;; Although the tables used here aren't very big, it might be worth
53 ;; splitting the file and autoloading the coding systems if/when my
54 ;; (or similar) autoloading code is installed.
58 (defun cp-make-translation-table (v)
59 "Return a translation table made from 128-long vector V.
60 V comprises characters encodable by mule-utf-8."
;; 256-element work vector, initially filled with 0.
61 (let ((encoding-vector (make-vector 256 0)))
;; Slot I is set to I itself.
63 (aset encoding-vector i i))
;; Slot I+128 receives element I of V.
65 (aset encoding-vector (+ i 128) (aref v i)))
;; The filled vector is handed to `make-translation-table-from-vector',
;; whose result is returned.
66 (make-translation-table-from-vector encoding-vector)))
68 (defun cp-valid-codes (v)
69 "Derive a valid-codes list for translation vector V.
70 See `make-coding-system'."
;; The first range starts out as 0..127; I starts at 128.
72 (i 128) ; index into v
73 (start 0) ; start of a valid range
74 (end 127)) ; end of a valid range
;; V is indexed from 0, so code I is looked up as element I-128.
76 (if (aref v (- i 128)) ; start or extend range
;; Open a new range at I only when no range is currently open.
79 (unless start (setq start i)))
;; Record the range as a (START . END) pair.
81 (push (cons start end) pairs))
;; Record a range that is still open (START non-nil) at this point.
84 (if start (push (cons start end) pairs))
87 (defun cp-fix-safe-chars (cs)
88 "Remove `char-coding-system-table' entries from previous definition of CS.
89 CS is a base coding system or alias."
;; Nothing to do unless CS already names a coding system.
90 (when (coding-system-p cs)
;; CHARS is the `safe-chars' property of CS.
91 (let ((chars (coding-system-get cs 'safe-chars)))
;; V is skipped when it is nil or t.
94 (if (and v (not (eq v t)))
95 (aset char-coding-system-table
;; New value: the table entry at V with CS removed by `remq'.
97 (remq cs (aref char-coding-system-table v)))))
100 ;; Fix things that have been, or might be done by codepage.el.
101 (eval-after-load "codepage"
104 (dolist (cs '(cp857 cp861 cp1253 cp852 cp866 cp437 cp855 cp869 cp775
105 cp862 cp864 cp1250 cp863 cp865 cp1251 cp737 cp1257 cp850
107 (cp-fix-safe-chars cs))
109 ;; Semi-dummy version for the stuff in codepage.el which we don't
110 ;; define here. (Used by mule-diag.)
111 (defun cp-supported-codepages ()
112 "Return an alist of supported codepages.
114 Each association in the alist has the form (NNN . CHARSET), where NNN is the
115 codepage number, and CHARSET is the MULE charset which is the closest match
116 for the character set supported by that codepage.
118 A codepage NNN is supported if a variable called `cpNNN-decode-table' exists,
119 is a vector, and has a charset property."
;; NOTE(review): this version performs no `cpNNN-decode-table' lookup.
;; It returns a constant alist of three entries (774, 770, 773), each
;; keyed by the codepage number as a string and paired with
;; `latin-iso8859-4'.
120 '(("774" . latin-iso8859-4) ("770" . latin-iso8859-4)
121 ("773" . latin-iso8859-4)))
123 ;; A version which doesn't override the coding systems set up by this
124 ;; file. It could still be used for the few missing ones from
126 (defun codepage-setup (codepage)
127 "Create a coding system cpCODEPAGE to support the IBM codepage CODEPAGE.
129 These coding systems are meant for encoding and decoding 8-bit non-ASCII
130 characters used by the IBM codepages, typically in conjunction with files
131 read/written by MS-DOS software, or for display on the MS-DOS terminal."
;; CODEPAGE is read with `completing-read' over the alist returned by
;; `cp-supported-codepages', ignoring case and requiring a match.
;; NOTE(review): the default passed is 437, which is not among the
;; entries `cp-supported-codepages' returns in this file -- confirm
;; that this is intended.
133 (let ((completion-ignore-case t)
134 (candidates (cp-supported-codepages)))
135 (list (completing-read "Setup DOS Codepage: (default 437) " candidates
136 nil t nil nil "437"))))
;; CP is the coding-system name: cp followed by CODEPAGE.
137 (let ((cp (format "cp%s" codepage)))
;; A coding system that already exists under that name is left alone.
138 (unless (coding-system-p (intern cp))
139 (cp-make-coding-systems-for-codepage
140 cp (cp-charset-for-codepage cp) (cp-offset-for-codepage cp))))))
143 ;; Somewhat amended from the version in mule-diag.el, needed below.
144 (defvar non-iso-charset-alist
150 (ascii vietnamese-viscii-lower vietnamese-viscii-upper)
151 viet-viscii-nonascii-translation-table
154 (ascii chinese-big5-1 chinese-big5-2)
157 ((?\xA1 ?\xFE) . (?\x40 ?\x7E ?\xA1 ?\xFE))))
159 (ascii katakana-jisx0201 japanese-jisx0208)
161 ((32 127 ?\xA1 ?\xDF)
162 ((?\x81 ?\x9F ?\xE0 ?\xEF) . (?\x40 ?\x7E ?\x80 ?\xFC))))))
164 ;; Macro to allow ccl compilation at byte-compile time, avoiding
167 (defmacro cp-make-coding-system (name v &optional doc-string mnemonic)
168 "Make coding system NAME for an 8-bit, extended-ASCII character set.
169 V is a 128-long vector of characters to translate the upper half of
170 the character set. DOC-STRING and MNEMONIC are used as the
171 corresponding args of `make-coding-system'. If MNEMONIC isn't given,
;; ENCODER and DECODER are the symbols encode-NAME and decode-NAME; they
;; name the translation tables used by the CCL code below.
173 (let* ((encoder (intern (format "encode-%s" name)))
174 (decoder (intern (format "decode-%s" name)))
180 (if (r1 < 128) ;; ASCII
181 (r0 = ,(charset-id 'ascii))
183 (r0 = ,(charset-id 'eight-bit-control))
184 (r0 = ,(charset-id 'eight-bit-graphic))))
;; Decoding: translate through the DECODER table, then emit the character.
185 (translate-character ,decoder r0 r1)
186 (write-multibyte-character r0 r1)
;; Encoding: read a character, translate it through the ENCODER table,
;; and write R1.
192 (read-multibyte-character r0 r1)
193 (translate-character ,encoder r0 r1)
194 (write-repeat r1)))))))
;; The expansion builds the translation table from V and derives the
;; valid code ranges of V.
195 `(let ((translation-table (cp-make-translation-table ,v))
196 (codes (cp-valid-codes ,v)))
197 (define-translation-table ',decoder translation-table)
;; The ENCODER table is extra slot 0 of the table built from V.
198 (define-translation-table ',encoder
199 (char-table-extra-slot translation-table 0))
;; Clear `char-coding-system-table' entries left by an earlier NAME.
200 (cp-fix-safe-chars ',name)
;; MNEMONIC falls back to ?* when it is not supplied.
202 ',name 4 ,(or mnemonic ?*)
;; A nil DOC-STRING falls back to NAME followed by the word encoding.
203 (or ,doc-string (format "%s encoding" ',name))
204 (cons ,ccl-decoder ,ccl-encoder)
;; Properties: safe-chars (the ENCODER table), valid-codes, and NAME
;; itself as the mime-charset.
205 (list (cons 'safe-chars (get ',encoder 'translation-table))
206 (cons 'valid-codes codes)
207 (cons 'mime-charset ',name)))
211 (let (l) ; code range
212 (dolist (elt (reverse codes))
216 non-iso-charset-alist))))
219 ;; These tables were mostly derived by running something like
220 ;; `recode -f cpxxx/..utf-8' on a binary file filled by
221 ;; `(dotimes (i 128) (insert ?? ?\\ (+ 128 i) ?\n))' and then
222 ;; exchanging the ?\� entries for nil. iconv was used instead in some
225 ;; Fixme: Do better for mode-line mnemonics?
227 (cp-make-coding-system
358 (cp-make-coding-system
488 (coding-system-put 'cp737 'mime-charset nil) ; not in IANA list
490 (cp-make-coding-system
621 (cp-make-coding-system
752 (cp-make-coding-system
883 (cp-make-coding-system
1014 (cp-make-coding-system
1145 (cp-make-coding-system
1276 (cp-make-coding-system
1407 (cp-make-coding-system
1538 (cp-make-coding-system
1669 (cp-make-coding-system
1800 (cp-make-coding-system
1931 (cp-make-coding-system
2062 ;; ;; This should be the same as cyrillic-alternativnyj,
2063 ;; ;; (<URL:http://czyborra.com/charsets/cyrillic.html>), but code point
2064 ;; ;; 255 in the cyrillic.el alternativnyj table is `№', i.e. point 240
2065 ;; ;; in 8859-5, not no-break space as below; `№' should be at point 252.
2066 ;; (cp-make-coding-system
2196 ;; "CP866 (Cyrillic Alternativnyj) encoding using Unicode."
2198 ;; (define-coding-system-alias 'alternativnyj 'cp866)
2199 ;; (cp-fix-safe-chars 'cyrillic-alternativnyj)
2200 ;; (define-coding-system-alias 'cyrillic-alternativnyj 'cp866)
2202 (cp-make-coding-system
2333 (cp-make-coding-system
2464 (cp-make-coding-system
2596 (cp-make-coding-system
2728 (cp-make-coding-system
2859 (cp-make-coding-system
2991 (cp-make-coding-system
3123 (cp-make-coding-system
3255 (cp-make-coding-system
3387 (cp-make-coding-system
3518 (cp-make-coding-system
3649 (cp-make-coding-system
3779 "NeXTstep encoding." ?N)
3781 (cp-make-coding-system
3911 "Cyrillic KOI8-U (Ukranian) encoding."
3914 ;; ;; Unicode-based, not cyrillic-iso8859-5 based (and thus incomplete)
3915 ;; ;; like the standard version.
3916 ;; (cp-make-coding-system
3917 ;; ;; The base system uses cyrillic-koi8 as the canonical name, but
3918 ;; ;; that's not a MIME name.
4048 ;; "Unicode-based KOI8 encoding for Cyrillic (MIME: KOI8-R)"
4050 ;; (cp-fix-safe-chars 'cyrillic-koi8)
4051 ;; (define-coding-system-alias 'cyrillic-koi8 'koi8-r)
4052 ;; (define-coding-system-alias 'koi8 'koi8-r)
4053 ;; (define-coding-system-alias 'cp878 'koi8-r)
4055 (cp-make-coding-system
4056 koi8-t ; used by glibc for tg_TJ
4185 "Unicode-based KOI8-T encoding for Cyrillic")
4186 (coding-system-put 'koi8-t 'mime-charset nil) ; not in the IANA list
4188 ;; Online final ISO draft:
4190 ;; http://www.egt.ie/standards/iso8859/fdis8859-16-en.pdf
4192 ;; Equivalent National Standard:
4193 ;; Romanian Standard SR 14111:1998, Romanian Standards Institution
4198 ;; "This set of coded graphic characters is intended for use in data and
4199 ;; text processing applications and also for information interchange. The
4200 ;; set contains graphic characters used for general purpose applications in
4201 ;; typical office environments in at least the following languages:
4202 ;; Albanian, Croatian, English, Finnish, French, German, Hungarian, Irish
4203 ;; Gaelic (new orthography), Italian, Latin, Polish, Romanian, and
4204 ;; Slovenian. This set of coded graphic characters may be regarded as a
4205 ;; version of an 8-bit code according to ISO/IEC 2022 or ISO/IEC 4873 at
4206 ;; level 1." [ISO 8859-16:2001(E), p. 1]
4208 ;; This charset is suitable for use in MIME text body parts.
4210 ;; ISO 8859-16 was primarily designed for single-byte encoding the Romanian
4211 ;; language. The UTF-8 charset is the preferred and in today's MIME software
4212 ;; more widely implemented encoding suitable for Romanian.
4213 (cp-make-coding-system
4214 iso-latin-10 ; consistent with, e.g. Latin-1
4215 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4216 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4313 "Unicode-based encoding for Latin-10 (MIME: ISO-8859-16)"
4315 (coding-system-put 'iso-latin-10 'mime-charset 'iso-8859-16)
4316 (define-coding-system-alias 'iso-8859-16 'iso-latin-10)
4317 (define-coding-system-alias 'latin-10 'iso-latin-10)
4319 ;; Unicode-based alternative which has the possible advantage of
4320 ;; having its relative sparseness specified.
4321 (cp-make-coding-system
4322 ;; The base system uses arabic-iso-8bit, but that's not a MIME charset.
4324 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4325 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4377 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4378 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4379 nil nil nil nil nil nil nil nil nil nil nil]
4380 "Unicode-based Arabic ISO/IEC 8859-6 (MIME: ISO-8859-6)"
4382 (define-coding-system-alias 'arabic-iso-8bit 'iso-8859-6)
4384 (cp-make-coding-system
4386 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4387 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4484 "Unicode-based encoding for Latin-6 (MIME: ISO-8859-10)")
4485 (coding-system-put 'iso-latin-6 'mime-charset 'iso-8859-10)
4486 (define-coding-system-alias 'iso-8859-10 'iso-latin-6)
4487 (define-coding-system-alias 'latin-6 'iso-latin-6)
4489 ;; used by lt_LT, lv_LV, mi_NZ
4490 (cp-make-coding-system
4492 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4493 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4591 "Unicode-based encoding for Latin-7 (MIME: ISO-8859-13)"
4592 ?l) ;; Lithuanian/Latvian
4593 (coding-system-put 'iso-latin-7 'mime-charset 'iso-8859-13)
4594 (define-coding-system-alias 'iso-8859-13 'iso-latin-7)
4595 (define-coding-system-alias 'latin-7 'iso-latin-7)
4597 (cp-make-coding-system
4598 georgian-ps ; used by glibc for ka_GE
4728 (coding-system-put 'georgian-ps 'mime-charset nil) ; not in IANA list
4730 ;; From http://www.microsoft.com/globaldev/reference/oem/720.htm
4731 (cp-make-coding-system
4861 (coding-system-put 'cp720 'mime-charset nil) ; not in IANA list
4863 ;; http://oss.software.ibm.com/cvs/icu/charset/data/ucm/ibm-1125_P100-2000.ucm
4864 (cp-make-coding-system
4994 ;; Original name for cp1125, says Serhii Hlodin <hlodin@lutsk.bank.gov.ua>
4995 (define-coding-system-alias 'cp866u 'cp1125)
4998 (let ((w (intern (format "windows-125%d" i)))
4999 (c (intern (format "cp125%d" i))))
5000 (define-coding-system-alias c w)
5001 ;; Compatibility with codepage.el, though cp... are not the
5003 (push (assoc w non-iso-charset-alist) non-iso-charset-alist)))
5005 ;; Use Unicode font under Windows. Jason Rumney fecit.
;; The registration runs only when `w32-add-charset-info' is defined as a
;; function and the variable `w32-unicode-charset-defined' is unbound.
5006 (if (and (fboundp 'w32-add-charset-info)
5007 (not (boundp 'w32-unicode-charset-defined)))
5008 (w32-add-charset-info "iso10646-1" 'w32-charset-ansi t))
5010 (provide 'code-pages)
5012 ;;; code-pages.el ends here