code.delx.au - gnu-emacs/blob - lisp/international/codepage.el

   1 ;;; codepage.el --- MS-DOS specific coding systems.
   2
   3 ;; Copyright (C) 1998 Free Software Foundation, Inc.
   4
   5 ;; Author: Eli Zaretskii
   6 ;; Maintainer: FSF
   7 ;; Keywords: i18n ms-dos codepage
   8
   9 ;; This file is part of GNU Emacs.
  10
  11 ;; GNU Emacs is free software; you can redistribute it and/or modify
  12 ;; it under the terms of the GNU General Public License as published by
  13 ;; the Free Software Foundation; either version 2, or (at your option)
  14 ;; any later version.
  15
  16 ;; GNU Emacs is distributed in the hope that it will be useful,
  17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 ;; GNU General Public License for more details.
  20
  21 ;; You should have received a copy of the GNU General Public License
  22 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 ;; Boston, MA 02111-1307, USA.
  25
  26 ;;; Commentary:
  27
  28 ;; Special coding systems for DOS codepage support.
  29 ;;
  30 ;; These coding systems perform conversion from the DOS codepage encoding
  31 ;; to one of the ISO-8859 character sets.  Each codepage has its corresponding
  32 ;; ISO-8859 charset, chosen so as to be able to convert all (or most) of the
  33 ;; characters.  The idea is that Emacs internally works with the usual MULE
  34 ;; charsets, and the conversion to and from the DOS codepage is performed
  35 ;; on I/O only.
  36 ;; See term/internal.el for the complementary setup of the DOS terminal
  37 ;; display and input methods.
  38 ;;
  39 ;; Thanks to Ken'ichi Handa <handa@etl.go.jp> for writing the CCL
  40 ;; encoders/decoders, and for help in debugging this code.
  41
  42 ;;; Code:
  43
(defun cp-coding-system-for-codepage-1 (coding mnemonic iso-name
                                               decoder encoder)
  "Make coding system CODING for a DOS codepage using translation tables.
MNEMONIC is a character to be displayed on mode line for the coding system.
ISO-NAME is the name of the ISO-8859 charset which corresponds to this
codepage.
DECODER is a translation table for converting characters in the DOS codepage
encoding to Emacs multibyte characters.
ENCODER is a translation table for encoding Emacs multibyte characters into
external DOS codepage codes.

Note that the coding systems created by this function support automatic
detection of the EOL format.  However, the decoders and encoders created
for these coding systems only support DOS and Unix style EOLs (the -mac
variety is actually just an alias for the -unix variety)."
  (save-match-data
    (let* ((coding-name (symbol-name coding))
           ;; Code emitted for characters the codepage cannot encode: on
           ;; MS-DOS the low 8 bits of `dos-unsupported-char-glyph' (or
           ;; 127 when that variable is nil), elsewhere the character `?'.
           (undef (if (eq system-type 'ms-dos)
                      (if dos-unsupported-char-glyph
                          (logand dos-unsupported-char-glyph 255)
                        127)
                    ??))
           ;; DOS-EOL decoder.  A carriage return is read but never
           ;; written, so \r\n comes out as \n.  Bytes below 128 are
           ;; copied as is; bytes from 128 up are looked up in DECODER
           ;; with r0 preset to the ASCII charset id.  If r0 is still
           ;; ASCII after the lookup the code is written as a single
           ;; byte, otherwise as the multibyte character (r0, r1).
           (ccl-decoder-dos
            (ccl-compile
             `(4 (loop (read r1)
                       (if (r1 != ?\r)
                           (if (r1 >= 128)
                               ((r0 = ,(charset-id 'ascii))
                                (translate-character ,decoder r0 r1)
                                (if (r0 == ,(charset-id 'ascii))
                                    (write r1)
                                  (write-multibyte-character r0 r1)))
                             (write r1)))
                       (repeat)))))
           ;; Unix-EOL decoder: the same translation as above, but
           ;; without the special case for carriage return.
           (ccl-decoder-unix
            (ccl-compile
             `(4 (loop (read r1)
                       (if (r1 >= 128)
                           ((r0 = ,(charset-id 'ascii))
                            (translate-character ,decoder r0 r1)
                            (if (r0 == ,(charset-id 'ascii))
                                (write r1)
                              (write-multibyte-character r0 r1)))
                         (write r1))
                       (repeat)))))
           ;; DOS-EOL encoder.  For a newline, \r is written first and
           ;; the closing `write-repeat' then writes the \n itself.
           ;; Non-ASCII characters are translated through ENCODER.  When
           ;; the translation yields charset `japanese-jisx0208', r1 is
           ;; replaced by UNDEF and written here, and `write-repeat'
           ;; writes it once more, so such a character produces UNDEF
           ;; twice.
           (ccl-encoder-dos
            (ccl-compile
             ;; The 2 here supplies the buf_magnification parameter for
             ;; the CCL program.  Since the -dos coding system generates
             ;; \r\n for each \n, a factor of 2 covers even the worst case
             ;; of empty lines with a single \n.
             `(2 (loop (read-multibyte-character r0 r1)
                       (if (r1 == ?\n)
                           (write ?\r)
                         (if (r0 != ,(charset-id 'ascii))
                             ((translate-character ,encoder r0 r1)
                              (if (r0 == ,(charset-id 'japanese-jisx0208))
                                  ((r1 = ,undef)
                                   (write r1))))))
                       (write-repeat r1)))))
           ;; Unix-EOL encoder: as the DOS encoder, minus the newline
           ;; handling, hence a buffer magnification of 1.
           (ccl-encoder-unix
            (ccl-compile
             `(1 (loop (read-multibyte-character r0 r1)
                       (if (r0 != ,(charset-id 'ascii))
                           ((translate-character ,encoder r0 r1)
                            (if (r0 == ,(charset-id 'japanese-jisx0208))
                                ((r1 = ,undef)
                                 (write r1)))))
                       (write-repeat r1))))))
      ;; Remove any existing entry for CODING from `coding-system-list'
      ;; before it is defined again below.
      (if (memq coding coding-system-list)
          (setq coding-system-list (delq coding coding-system-list)))

      ;; Make coding system CODING.  The base system is given the
      ;; Unix-EOL decoder/encoder pair.
      ;; NOTE(review): the literal 4 is presumably the CCL coding-system
      ;; type of `make-coding-system' -- confirm against its docstring.
      (make-coding-system
       coding 4 mnemonic
       (concat "8-bit encoding of " (symbol-name iso-name)
               " characters using IBM codepage " coding-name)
       (cons ccl-decoder-unix ccl-encoder-unix)
       `((safe-charsets ascii eight-bit-control eight-bit-graphic ,iso-name)
         (valid-codes (0 . 255))
         (charset-origin-alist ,(list iso-name (symbol-name coding) encoder))))
      ;;; Make coding systems CODING-unix, CODING-dos, CODING-mac.
      (make-subsidiary-coding-system coding)
      ;; Record the names of the three subsidiaries as CODING's
      ;; `eol-type' property.
      (put coding 'eol-type (vector (intern (format "%s-unix" coding))
                                    (intern (format "%s-dos" coding))
                                    (intern (format "%s-mac" coding))))
      ;; Change CCL code for CODING-dos.  A copy of CODING's
      ;; `coding-system' vector gets the DOS decoder/encoder pair in
      ;; slot 4 and is installed on CODING-dos only; CODING-unix and
      ;; CODING-mac are not modified here.
      (let ((coding-spec (copy-sequence (get coding 'coding-system))))
        (aset coding-spec 4
              (cons (check-ccl-program
                     ccl-decoder-dos
                     (intern (format "%s-dos-decoder" coding)))
                    (check-ccl-program
                     ccl-encoder-dos
                     (intern (format "%s-dos-encoder" coding)))))
        (put (intern (concat coding-name "-dos")) 'coding-system
             coding-spec)))))
 141
 142 (defun cp-decoding-vector-for-codepage (table charset offset)
 143   "Create a vector for decoding IBM PC characters using conversion table
 144 TABLE into an ISO-8859 character set CHARSET whose first non-ASCII
 145 character is generated by (make-char CHARSET OFFSET)."
 146   (let* ((len (length table))
 147          (undefined-char
 148           (if (eq system-type 'ms-dos)
 149               (if dos-unsupported-char-glyph
 150                   (logand dos-unsupported-char-glyph 255)
 151                 127)
 152             32))
 153          (vec1 (make-vector 256 undefined-char))
 154          (i 0))
 155     (while (< i 256)
 156       (aset vec1 i i)
 157       (setq i (1+ i)))
 158     (setq i 0)
 159     (while (< i len)
 160       (if (aref table i)
 161           (aset vec1 (aref table i) (make-char charset (+ i offset))))
 162       (setq i (1+ i)))
 163     vec1))
 164
 165 ;;; You don't think I created all these tables below by hand, do you?
 166 ;;; The following Awk script will create the table for cp850-to-Latin-1
 167 ;;; conversion from the RFC 1345 file (the other tables are left as an
 168 ;;; exercise):
 169 ;;; BEGIN { n_pages = 11;
 170 ;;;         pn["IBM437"] = 0; pn["IBM850"] = 1; pn["IBM851"] = 2;
 171 ;;;         pn["IBM852"] = 3; pn["IBM855"] = 4; pn["IBM860"] = 5;
 172 ;;;         pn["IBM861"] = 6; pn["IBM862"] = 7; pn["IBM863"] = 8;
 173 ;;;         pn["IBM864"] = 9; pn["IBM865"] = 10;
 174 ;;;       }
 175 ;;; $1 == "&charset" { charset = $2; }
 176 ;;; $1 == "&code"    { code = $2; }
 177 ;;; /^  [^&]/  {
 178 ;;;   if ((charset ~ /^IBM(437|8(5[0125]|6[0-5]))$/) || (charset ~ /^ISO_8859-1/))
 179 ;;;     {
 180 ;;;       for (i = 1; i <= NF; i++)
 181 ;;;         chars[charset,code++] = $i;
 182 ;;;     }
 183 ;;;   }
 184 ;;;
 185 ;;; END {
 186 ;;;   for (i = 160; i < 256; i++)
 187 ;;;     {
 188 ;;;       c =  chars["ISO_8859-1:1987",i];
 189 ;;;       if (c == "??")        # skip unused positions
 190 ;;;         {
 191 ;;;           printf " nil";
 192 ;;;           if ((i - 159)%16 == 0)
 193 ;;;             printf "\n";
 194 ;;;           continue;
 195 ;;;         }
 196 ;;;       found = 0;
 197 ;;;       for (j in pn)
 198 ;;;         map[j] = "nil";
 199 ;;;       for (combined in chars)
 200 ;;;         {
 201 ;;;           candidate = chars[combined];
 202 ;;;           split (combined, separate, SUBSEP);
 203 ;;;           if (separate[1] == "IBM850" && candidate == c)
 204 ;;;             {
 205 ;;;               found = 1;
 206 ;;;               map[separate[1]] = separate[2];
 207 ;;;             }
 208 ;;;         }
 209 ;;;       printf " %s", map["IBM850"];
 210 ;;;       if ((i - 159)%16 == 0)
 211 ;;;         printf "\n";
 212 ;;;     }
 213 ;;; }
 214
 215 ;;; WARNING WARNING WARNING!!!
 216 ;;;
 217 ;;; If you want to get fancy with these tables, remember that the inverse
 218 ;;; tables, created by `cp-decoding-vector-for-codepage' above, are installed
 219 ;;; on MS-DOS as nonascii-translation-table (see `dos-codepage-setup' on
 220 ;;; internal.el).  Therefore, you should NOT put any codes below 128 in
 221 ;;; these tables!  Otherwise, various Emacs commands and functions will
 222 ;;; mysteriously fail!  For example, a typical screwup is to map the Latin-N
 223 ;;; acute accent character to the apostrophe, and have all regexps which
 224 ;;; end with "\\'" begin to fail (e.g., the automatic setting of the major
 225 ;;; mode by file name extension will stop working).
 226 ;;;
 227 ;;; You HAVE BEEN warned!
 228
;; US/English/PC-8/IBM-2.  This doesn't support Latin-1 characters very
;; well, but why not use what we can salvage?
(defvar cp437-decode-table
  ;; Nth element is the code of a cp437 glyph for the multibyte
  ;; character created by (make-char 'latin-iso8859-1 (+ N 160)).
  ;; The element nil means there's no corresponding cp437 glyph.
  ;; The table is laid out as six rows of 16 elements.
  [
   255 173 155 156 nil 157 179 nil nil nil 166 174 170 196 nil nil
   248 241 253 nil nil nil nil 249 nil nil 167 175 172 171 nil 168
   nil nil nil nil 142 143 146 128 nil 144 nil nil nil nil nil nil
   nil 165 nil nil nil nil 153 nil nil nil nil nil 154 nil nil 225
   133 160 131 nil 132 134 145 135 138 130 136 137 141 161 140 139
   nil 164 149 162 147 nil 148 246 nil 151 163 150 129 nil nil 152]
  "Table for converting ISO-8859-1 characters into codepage 437 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp437-decode-table
          '(charset latin-iso8859-1 language "Latin-1" offset 160))
 245
;; Multilingual (Latin-1)
(defvar cp850-decode-table
  ;; Nth element is the code of a cp850 glyph for the multibyte
  ;; character created by (make-char 'latin-iso8859-1 (+ N 160)).
  ;; The element nil means there's no corresponding cp850 glyph.
  ;; The table is laid out as six rows of 16 elements.
  [
   255 173 189 156 207 190 221 245 249 184 166 174 170 240 169 nil
   248 241 253 252 239 230 244 250 247 251 167 175 172 171 243 168
   183 181 182 199 142 143 146 128 212 144 210 211 222 214 215 216
   209 165 227 224 226 229 153 158 157 235 233 234 154 237 231 225
   133 160 131 198 132 134 145 135 138 130 136 137 141 161 140 139
   208 164 149 162 147 228 148 246 155 151 163 150 129 236 232 152]
  "Table for converting ISO-8859-1 characters into codepage 850 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp850-decode-table
          '(charset latin-iso8859-1 language "Latin-1" offset 160))
 261
;; Greek
(defvar cp851-decode-table
  ;; Nth element is the code of a cp851 glyph for the multibyte
  ;; character created by (make-char 'greek-iso8859-7 (+ N 160)).
  ;; The element nil means there's no corresponding cp851 glyph.
  [
   255 nil nil 156 nil nil nil 245 249 nil nil 174 nil 240 nil nil
   248 241 nil nil 239 nil 134 nil 141 143 144 175 146 171 149 152
   161 164 165 166 167 168 169 170 172 173 181 182 184 183 189 190
   198 199 nil 207 208 209 210 211 212 213 nil nil 155 157 158 159
   252 214 215 216 221 222 224 225 226 227 228 229 230 231 232 233
   234 235 237 236 238 242 243 244 246 250 160 251 162 163 253 nil]
  "Table for converting ISO-8859-7 characters into codepage 851 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp851-decode-table
          '(charset greek-iso8859-7 language "Greek" offset 160))
 274
;; Slavic/Eastern Europe (Latin-2)
(defvar cp852-decode-table
  ;; Nth element is the code of a cp852 glyph for the multibyte
  ;; character created by (make-char 'latin-iso8859-2 (+ N 160)).
  [
   255 164 244 157 207 149 151 245 249 230 184 155 141 240 166 189
   248 165 247 136 239 150 152 243 242 231 173 156 171 241 167 190
   232 181 182 198 142 145 143 128 172 144 168 211 183 214 215 210
   209 227 213 224 226 138 153 158 252 222 233 235 154 237 221 225
   234 160 131 199 132 146 134 135 159 130 169 137 216 161 140 212
   208 228 229 162 147 139 148 246 253 133 163 251 129 236 238 250]
  "Table for converting ISO-8859-2 characters into codepage 852 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp852-decode-table
          '(charset latin-iso8859-2 language "Latin-2" offset 160))
 287
;; Russian
(defvar cp855-decode-table
  ;; Nth element is the code of a cp855 glyph for the multibyte
  ;; character created by (make-char 'cyrillic-iso8859-5 (+ N 160)).
  [
   255 133 129 131 135 137 139 141 143 145 147 149 151 240 153 155
   161 163 236 173 167 169 234 244 184 190 199 209 211 213 215 221
   226 228 230 232 171 182 165 252 246 250 159 242 238 248 157 224
   160 162 235 172 166 168 233 243 183 189 198 208 210 212 214 216
   225 227 229 231 170 181 164 251 245 249 158 241 237 247 156 222
   239 132 128 130 134 136 138 140 142 144 146 148 150 253 152 154]
  "Table for converting ISO-8859-5 characters into codepage 855 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp855-decode-table
          '(charset cyrillic-iso8859-5 language "Cyrillic-ISO" offset 160))
 300
 301 ;; Turkish
 302 (defvar cp857-decode-table
 303   [
 304    255 nil nil 156 207 nil 245 249 152 158 166 nil 240 nil
 305    248 nil 253 252 239 nil nil nil nil 141 159 167 nil 171 nil
 306    183 181 182 142 nil nil 128 212 144 210 211 222 214 215 216
 307    165 227 224 226 nil 153 232 nil 235 233 234 154 nil nil 225
 308    133 160 131 132 nil nil 135 138 130 136 137 236 161 140 139
 309    164 149 162 147 nil 148 246 nil 151 163 150 129 nil nil 250]
 310   "Table for converting ISO-8859-3 characters into codepage 857 glyphs.")
 311 (setplist 'cp857-decode-table
 312           '(charset latin-iso8859-3 language "Latin-3" offset 160))
 313
;; Portuguese
(defvar cp860-decode-table
  ;; Nth element is the code of a cp860 glyph for the multibyte
  ;; character created by (make-char 'latin-iso8859-1 (+ N 160)).
  ;; The element nil means there's no corresponding cp860 glyph.
  [
   255 173 155 156 nil nil 179 nil nil nil 166 174 170 nil nil nil
   nil 241 253 nil nil nil nil 249 nil nil 167 175 172 171 nil 168
   145 134 143 142 nil nil nil 128 146 144 137 nil 152 nil 139 nil
   nil 165 159 169 140 153 nil nil nil 157 150 nil 154 nil nil nil
   133 160 131 132 nil nil nil 135 138 130 136 nil 141 161 nil nil
   nil 164 149 162 147 148 nil 246 nil 151 163 nil 129 nil nil nil]
  "Table for converting ISO-8859-1 characters into codepage 860 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp860-decode-table
          '(charset latin-iso8859-1 language "Latin-1" offset 160))
 326
;; Icelandic
(defvar cp861-decode-table
  ;; Nth element is the code of a cp861 glyph for the multibyte
  ;; character created by (make-char 'latin-iso8859-1 (+ N 160)).
  ;; The element nil means there's no corresponding cp861 glyph.
  [
   255 173 nil 156 nil nil nil nil nil nil nil 174 170 nil nil nil
   nil 241 253 nil nil nil nil 249 nil nil nil 175 172 171 nil 168
   nil 164 nil nil 142 143 146 128 nil 144 nil nil nil 165 nil nil
   139 nil 159 166 nil nil 153 nil 157 nil 167 nil 154 151 141 nil
   133 160 131 nil 132 134 145 135 138 130 136 137 nil 161 nil nil
   140 nil nil 162 147 nil 148 246 155 nil 163 150 129 152 149 nil]
  "Table for converting ISO-8859-1 characters into codepage 861 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp861-decode-table
          '(charset latin-iso8859-1 language "Latin-1" offset 160))
 339
;; Hebrew
(defvar cp862-decode-table
  ;; Nth element is the code of a cp862 glyph for the multibyte
  ;; character created by (make-char 'hebrew-iso8859-8 (+ N 160)).
  ;; The element nil means there's no corresponding cp862 glyph.
  [
   255 173 155 156 nil 157 179 nil nil nil nil 174 170 196 nil nil
   248 241 253 nil nil 230 nil 249 nil nil 246 175 172 171 nil nil
   nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
   nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil 205
   128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
   144 145 146 147 148 149 150 151 152 153 154 nil nil nil nil nil]
  "Table for converting ISO-8859-8 characters into codepage 862 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp862-decode-table
          '(charset hebrew-iso8859-8 language "Hebrew" offset 160))
 355
;; French Canadian
(defvar cp863-decode-table
  ;; Nth element is the code of a cp863 glyph for the multibyte
  ;; character created by (make-char 'latin-iso8859-1 (+ N 160)).
  ;; The element nil means there's no corresponding cp863 glyph.
  [
   255 nil 155 156 152 nil 160 143 164 nil nil 174 170 nil nil 167
   nil 241 253 166 161 nil 134 249 165 nil nil 175 172 171 173 nil
   142 nil 132 nil nil nil nil 128 145 144 146 148 nil nil 168 149
   nil nil nil nil 153 nil nil nil nil 157 nil 158 154 nil nil nil
   133 nil 131 nil nil nil nil 135 138 130 136 137 141 nil 140 139
   nil nil nil 162 147 nil nil 246 nil 151 163 150 129 nil nil nil]
  "Table for converting ISO-8859-1 characters into codepage 863 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp863-decode-table
          '(charset latin-iso8859-1 language "Latin-1" offset 160))
 368
 369 ;; Arabic
 370 ;; FIXME: Emacs doesn't seem to support the "Arabic" language
 371 ;; environment yet.  So this is only partially usable, for now
 372 (defvar cp864-decode-table
 373   [
 374    255 nil nil nil 164 nil nil nil nil nil nil nil 172 161 nil nil
 375    nil nil nil nil nil nil nil nil nil nil nil 187 nil nil nil 191
 376    nil 193 194 195 196 nil 198 199 169 201 170 171 173 174 175 207
 377    208 209 210 188 189 190 235 215 216 223 238 nil nil nil nil nil
 378    224 247 248 252 251 239 242 243 232 233 253 nil nil nil nil nil
 379    nil 241 nil nil nil nil nil nil nil nil nil nil nil nil nil nil]
 380   "Table for converting ISO-8859-1 characters into codepage 863 glyphs.")
 381 (setplist 'cp864-decode-table
 382           '(charset arabic-iso8859-6 language nil offset 160))
 383
;; Nordic (Norwegian/Danish)
(defvar cp865-decode-table
  ;; Nth element is the code of a cp865 glyph for the multibyte
  ;; character created by (make-char 'latin-iso8859-1 (+ N 160)).
  ;; The element nil means there's no corresponding cp865 glyph.
  [
   255 173 nil 156 nil nil nil nil nil nil 166 174 170 nil nil nil
   nil 241 253 nil nil nil nil 249 nil nil 167 175 172 171 nil 168
   nil nil nil nil 142 143 146 128 nil 144 nil nil nil nil nil nil
   nil 165 nil nil nil nil 153 nil 157 nil nil nil 154 nil nil nil
   133 160 131 nil 132 134 145 135 138 130 136 137 141 161 140 139
   nil 164 149 162 147 nil 148 246 155 151 163 150 129 nil nil 152]
  "Table for converting ISO-8859-1 characters into codepage 865 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp865-decode-table
          '(charset latin-iso8859-1 language "Latin-1" offset 160))
 396
;; Greek (yes, another one!)
(defvar cp869-decode-table
  ;; Nth element is the code of a cp869 glyph for the multibyte
  ;; character created by (make-char 'greek-iso8859-7 (+ N 160)).
  ;; The element nil means there's no corresponding cp869 glyph.
  [
   255 139 140 156 nil nil 138 245 249 151 nil 174 137 240 nil 142
   248 241 153 154 239 247 134 136 141 143 144 175 146 171 149 152
   161 164 165 166 167 168 169 170 172 173 181 182 183 184 189 190
   198 199 nil 207 208 209 210 211 212 213 145 150 155 157 158 159
   252 214 215 216 221 222 224 225 226 227 228 229 230 231 232 233
   234 235 237 236 238 242 243 244 246 250 160 251 162 163 253 nil]
  "Table for converting ISO-8859-7 characters into codepage 869 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp869-decode-table
          '(charset greek-iso8859-7 language "Greek" offset 160))
 409
;; Conversion from codepage 775 to Latin-4 for Baltic countries.
(defvar cp775-decode-table
  ;; Nth element is the code of a cp775 glyph for the multibyte
  ;; character created by (make-char 'latin-iso8859-4 (+ N 160)).
  ;; The element nil means there's no corresponding cp775 glyph.
  [
   255 181 nil 138 150 nil 234 245 166 190 237 149 173 240 207 nil
   248 208 nil 139 239 nil 235 nil nil 213 137 133 nil nil 216 nil
   160 nil nil nil 142 143 146 189 182 144 183 nil 184 nil nil 161
   nil 238 226 232 nil 229 153 158 157 198 nil nil 154 nil 199 225
   131 nil nil nil 132 134 145 212 209 130 210 nil 211 nil nil 140
   nil 236 147 233 nil 228 148 nil 155 214 nil nil 129 nil 215 nil]
  "Table for converting ISO-8859-4 characters into codepage 775 glyphs.")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp775-decode-table
          '(charset latin-iso8859-4 language "Latin-4" offset 160))
 422
 423 ;; Support for the Windows 12xx series of codepages that MS has
 424 ;; butchered from the ISO-8859 specs. This does not add support for
 425 ;; the extended characters that MS has added in the 128 - 159 coding
 426 ;; range, only translates those characters that can be expressed in
 427 ;; the corresponding iso-8859 codepage.
 428
 429 ;; Codepage Mapping:
 430 ;;
 431 ;; Windows-1250: ISO-8859-2 (Central Europe) - differs in some positions
 432 ;; Windows-1251: ISO-8859-5 (Cyrillic)       - differs wildly
 433 ;; Windows-1252: ISO-8859-1 (West Europe)    - exact match
 434 ;; Windows-1253: ISO-8859-7 (Greek)          - differs in some positions
 435 ;; Windows-1254: ISO-8859-9 (Turkish)        - exact match
 436 ;; Windows-1255: ISO-8859-8 (Hebrew)         - exact match
 437 ;; Windows-1256: ISO-8859-6 (Arabic)         - half match
 438 ;; Windows-1257: ISO-8859-4 (Baltic)         - differs, future Latin-7
 439 ;; Windows-1258: VISCII (Vietnamese)         - Completely different
 440
(defvar cp1250-decode-table
  ;; Nth element is the Windows-1250 code for the multibyte character
  ;; created by (make-char 'latin-iso8859-2 (+ N 160)).
  [
    160 165 162 163 164 188 140 167 168 138 170 141 143 173 142 175
    176 185 178 179 180 190 156 161 184 154 186 157 159 189 158 191
    192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
    208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
    224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
    240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 ]
  "ISO-8859-2 to Windows-1250 (Central Europe) codepage decoding table")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp1250-decode-table
          '(charset latin-iso8859-2 language "Latin-2" offset 160))
 452
(defvar cp1251-decode-table
  ;; Nth element is the Windows-1251 code for the multibyte character
  ;; created by (make-char 'cyrillic-iso8859-5 (+ N 160)).
  [
    160 168 128 129 170 189 178 175 163 138 140 142 141 173 161 143
    192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
    208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
    224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
    240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
    185 184 144 131 186 190 179 191 188 154 156 158 157 167 162 159 ]
  "ISO-8859-5 to Windows-1251 (Cyrillic) codepage decoding table")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp1251-decode-table
          '(charset cyrillic-iso8859-5 language "Cyrillic-ISO" offset 160))
 464
;; cp1253 is missing nbsp so we cannot quite translate perfectly.  It
;; also has two micro/mu characters which would require more complex
;; processing to accommodate.
(defvar cp1253-decode-table
  ;; Nth element is the Windows-1253 code for the multibyte character
  ;; created by (make-char 'greek-iso8859-7 (+ N 160)).
  ;; The element nil means there's no corresponding Windows-1253 code.
  [
    nil 145 146 163 nil nil 166 167 168 169 nil 171 172 173 nil 151
    176 177 178 179 180 161 162 183 184 185 186 187 188 189 190 191
    192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
    208 209 nil 211 212 213 214 215 216 217 218 219 220 221 222 223
    224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
    240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 nil ]
  "ISO-8859-7 to Windows-1253 (Greek) codepage decoding table")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp1253-decode-table
          '(charset greek-iso8859-7 language "Greek" offset 160))
 479
;; Since Latin-7 is not yet official, and Emacs does not support it,
;; provide translation between Windows-1257 and Latin-4 the best we
;; can.
(defvar cp1257-decode-table
  ;; Nth element is the Windows-1257 code for the multibyte character
  ;; created by (make-char 'latin-iso8859-4 (+ N 160)).
  ;; The element nil means there's no corresponding Windows-1257 code.
  [
    160 192 nil 170 164 nil 207 167 nil 208 199 204 nil 173 222 nil
    176 224 nil 186 nil nil 239 nil nil 240 231 236 nil nil 254 nil
    194 nil nil nil 196 197 175 193 200 201 198 nil 203 nil nil 206
    nil 210 212 205 nil 213 214 215 168 216 nil nil 220 nil 219 223
    226 nil nil nil 228 229 191 225 232 233 230 nil 235 nil nil 238
    nil 242 244 237 nil 245 246 247 184 248 nil nil 252 nil 251 nil ]
  "ISO-8859-4 to Windows-1257 (Baltic) codepage decoding table")
;; These properties are read by `cp-charset-for-codepage',
;; `cp-language-for-codepage' and `cp-offset-for-codepage'.
(setplist 'cp1257-decode-table
          '(charset latin-iso8859-4 language "Latin-4" offset 160))
 494
 495 ;;;###autoload
 496 (defun cp-make-coding-systems-for-codepage (codepage iso-name offset)
 497   "Create a coding system to convert IBM CODEPAGE into charset ISO-NAME
 498 whose first character is at offset OFFSET from the beginning of 8-bit
 499 ASCII table.
 500
 501 The created coding system has the usual 3 subsidiary systems: for Unix-,
 502 DOS- and Mac-style EOL conversion.  However, unlike built-in coding
 503 systems, the Mac-style EOL conversion is currently not supported by the
 504 decoder and encoder created by this function."
 505   (let* ((decode-table (intern (format "%s-decode-table" codepage)))
 506          (nonascii-table
 507           (intern (format "%s-nonascii-translation-table" codepage)))
 508          (decode-translation
 509           (intern (format "%s-decode-translation-table" codepage)))
 510          (encode-translation
 511           (intern (format "%s-encode-translation-table" codepage))))
 512     (set nonascii-table
 513          (make-translation-table-from-vector
 514           (cp-decoding-vector-for-codepage
 515            (symbol-value decode-table) iso-name offset)))
 516     (define-translation-table encode-translation
 517       (char-table-extra-slot (symbol-value nonascii-table) 0))
 518     ;; For charsets other than ascii, eight-bit-* and ISO-NAME, set
 519     ;; `?' for one-column charsets, and some Japanese character for
 520     ;; wide-column charsets.  CCL encoder convert that Japanese
 521     ;; character to either dos-unsupported-char-glyph or "??".
 522     (let ((tbl (char-table-extra-slot (symbol-value nonascii-table) 0))
 523           (undef (if (eq system-type 'ms-dos)
 524                      (if dos-unsupported-char-glyph
 525                          (logand dos-unsupported-char-glyph 255)
 526                        127)
 527                    ??))
 528           (charsets (delq 'ascii
 529                           (delq 'eight-bit-control
 530                                 (delq 'eight-bit-graphic
 531                                       (delq iso-name
 532                                             (copy-sequence charset-list))))))
 533           (wide-column-char (make-char 'japanese-jisx0208 32 32)))
 534       (while charsets
 535         (aset tbl (make-char (car charsets))
 536               (if (= (charset-width (car charsets)) 1) undef wide-column-char))
 537         (setq charsets (cdr charsets))))
 538     (define-translation-table decode-translation
 539       (symbol-value nonascii-table))
 540     (cp-coding-system-for-codepage-1
 541      (intern codepage) ?D iso-name decode-translation encode-translation)
 542     ))
 543
 544 (defun cp-codepage-decoder (codepage)
 545   "If CODEPAGE is the name of a supported codepage, return its decode table;
 546 otherwise return nil."
 547   (let ((cp (if (symbolp codepage) (symbol-name codepage) codepage)))
 548     (cond
 549      ((stringp cp)
 550       (intern-soft (format "%s-decode-table" cp)))
 551      (t nil))))
 552
 553 ;;;###autoload
 554 (defun cp-charset-for-codepage (codepage)
 555   "Return the charset for which there is a translation table to DOS CODEPAGE.
 556 CODEPAGE must be the name of a DOS codepage, a string."
 557   (let ((cp-decoder (cp-codepage-decoder codepage)))
 558     (if (null cp-decoder)
 559         (error "Unsupported codepage %s" codepage)
 560       (get cp-decoder 'charset))))
 561
 562 ;;;###autoload
 563 (defun cp-language-for-codepage (codepage)
 564   "Return the name of the MULE language environment for CODEPAGE.
 565 CODEPAGE must be the name of a DOS codepage, a string."
 566   (let ((cp-decoder (cp-codepage-decoder codepage)))
 567     (if (null cp-decoder)
 568         (error "Unsupported codepage %s" codepage)
 569       (get cp-decoder 'language))))
 570
 571 ;;;###autoload
 572 (defun cp-offset-for-codepage (codepage)
 573   "Return the offset to be used in setting up coding systems for CODEPAGE.
 574 CODEPAGE must be the name of a DOS codepage, a string."
 575   (let ((cp-decoder (cp-codepage-decoder codepage)))
 576     (if (null cp-decoder)
 577         (error "Unsupported codepage %s" codepage)
 578       (get cp-decoder 'offset))))
 579
 580 ;;;###autoload
 581 (defun cp-supported-codepages ()
 582   "Return an alist of supported codepages.
 583
 584 Each association in the alist has the form (NNN . CHARSET), where NNN is the
 585 codepage number, and CHARSET is the MULE charset which is the closest match
 586 for the character set supported by that codepage.
 587
 588 A codepage NNN is supported if a variable called `cpNNN-decode-table' exists,
 589 is a vector, and has a charset property."
 590   (save-match-data
 591     (let (alist chset sname)
 592       (mapatoms
 593        (function
 594         (lambda (sym)
 595           (if (and (boundp sym)
 596                    (string-match "\\`cp\\([1-9][0-9][0-9][0-9]?\\)-decode-table\\'"
 597                                  (setq sname (symbol-name sym)))
 598                    (vectorp (symbol-value sym))
 599                    (setq chset (get sym 'charset)))
 600               (setq alist
 601                     (cons (cons (match-string 1 sname) chset) alist))))))
 602       alist)))
 603
 604 ;;;###autoload
 605 (defun codepage-setup (codepage)
 606   "Create a coding system cpCODEPAGE to support the IBM codepage CODEPAGE.
 607
 608 These coding systems are meant for encoding and decoding 8-bit non-ASCII
 609 characters used by the IBM codepages, typically in conjunction with files
 610 read/written by MS-DOS software, or for display on the MS-DOS terminal."
 611   (interactive
 612    (let ((completion-ignore-case t)
 613          (candidates (cp-supported-codepages)))
 614      (list (completing-read "Setup DOS Codepage: (default 437) " candidates
 615                             nil t nil nil "437"))))
 616   (let ((cp (format "cp%s" codepage)))
 617     (cp-make-coding-systems-for-codepage
 618      cp (cp-charset-for-codepage cp) (cp-offset-for-codepage cp))))
 619
 620 (provide 'codepage)
 621
 622 ;; codepage.el ends here