code.delx.au - gnu-emacs/blob - src/charset.c

   1 /* Basic multilingual character support.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* At first, see the document in `charset.h' to understand the code in
  23    this file.  */
  24
  25 #ifdef emacs
  26 #include <config.h>
  27 #endif
  28
  29 #include <stdio.h>
  30
  31 #ifdef emacs
  32
  33 #include <sys/types.h>
  34 #include "lisp.h"
  35 #include "buffer.h"
  36 #include "charset.h"
  37 #include "coding.h"
  38 #include "disptab.h"
  39
  40 #else  /* not emacs */
  41
  42 #include "mulelib.h"
  43
  44 #endif /* emacs */
  45
  46 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
  47 Lisp_Object Qunknown;
  48
  49 /* Declaration of special leading-codes.  */
  50 int leading_code_private_11;    /* for private DIMENSION1 of 1-column */
  51 int leading_code_private_12;    /* for private DIMENSION1 of 2-column */
  52 int leading_code_private_21;    /* for private DIMENSION2 of 1-column */
  53 int leading_code_private_22;    /* for private DIMENSION2 of 2-column */
  54
  55 /* Declaration of special charsets.  The values are set by
  56    Fsetup_special_charsets.  */
  57 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
  58 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
  59 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
  60 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
  61 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
  62 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
  63 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
  64
  65 Lisp_Object Qcharset_table;
  66
  67 /* A char-table containing information of each character set.  */
  68 Lisp_Object Vcharset_table;
  69
  70 /* A vector of charset symbol indexed by charset-id.  This is used
  71    only for returning charset symbol from C functions.  */
  72 Lisp_Object Vcharset_symbol_table;
  73
  74 /* A list of charset symbols ever defined.  */
  75 Lisp_Object Vcharset_list;
  76
  77 /* Vector of translation table ever defined.
  78    ID of a translation table is used to index this vector.  */
  79 Lisp_Object Vtranslation_table_vector;
  80
  81 /* A char-table for characters which may invoke auto-filling.  */
  82 Lisp_Object Vauto_fill_chars;
  83
  84 Lisp_Object Qauto_fill_chars;
  85
  86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
  87 int bytes_by_char_head[256];
  88 int width_by_char_head[256];
  89
  90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
  91    CHARS, and FINAL-CHAR) to Emacs' charset.  */
  92 int iso_charset_table[2][2][128];
  93
  94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  95 unsigned char *_fetch_multibyte_char_p;
  96 int _fetch_multibyte_char_len;
  97
  98 /* Offset to add to a non-ASCII value when inserting it.  */
  99 int nonascii_insert_offset;
 100
 101 /* Translation table for converting non-ASCII unibyte characters
 102    to multibyte codes, or nil.  */
 103 Lisp_Object Vnonascii_translation_table;
 104
 105 /* List of all possible generic characters.  */
 106 Lisp_Object Vgeneric_character_list;
 107
 108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
 109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
 110 \f
 111 void
 112 invalid_character (c)
 113      int c;
 114 {
 115   error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
 116 }
 117
 118 /* Parse string STR of length LENGTH and fetch information of a
 119    character at STR.  Set BYTES to the byte length the character
 120    occupies, CHARSET, C1, C2 to proper values of the character. */
 121
 122 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)             \
 123   do {                                                                       \
 124     (c1) = *(str);                                                           \
 125     (bytes) = BYTES_BY_CHAR_HEAD (c1);                                       \
 126     if ((bytes) == 1)                                                        \
 127       (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
 128     else if ((bytes) == 2)                                                   \
 129       {                                                                      \
 130         if ((c1) == LEADING_CODE_8_BIT_CONTROL)                              \
 131           (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20;         \
 132         else                                                                 \
 133           (charset) = (c1), (c1) = (str)[1] & 0x7F;                          \
 134       }                                                                      \
 135     else if ((bytes) == 3)                                                   \
 136       {                                                                      \
 137         if ((c1) < LEADING_CODE_PRIVATE_11)                                  \
 138           (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F;  \
 139         else                                                                 \
 140           (charset) = (str)[1], (c1) = (str)[2] & 0x7F;                      \
 141       }                                                                      \
 142     else                                                                     \
 143       (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F;  \
 144   } while (0)
 145
 146 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  */
 147 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
 148   ((charset) == CHARSET_ASCII                           \
 149    ? ((c1) >= 0 && (c1) <= 0x7F)                        \
 150    : ((charset) == CHARSET_8_BIT_CONTROL                \
 151       ? ((c1) >= 0x80 && (c1) <= 0x9F)                  \
 152       : ((charset) == CHARSET_8_BIT_GRAPHIC             \
 153          ? ((c1) >= 0x80 && (c1) <= 0xFF)               \
 154          : (CHARSET_DIMENSION (charset) == 1            \
 155             ? ((c1) >= 0x20 && (c1) <= 0x7F)            \
 156             : ((c1) >= 0x20 && (c1) <= 0x7F             \
 157                && (c2) >= 0x20 && (c2) <= 0x7F)))))
 158
 159 /* Store multi-byte form of the character C in STR.  The caller should
 160    allocate at least 4-byte area at STR in advance.  Returns the
 161    length of the multi-byte form.  If C is an invalid character code,
 162    signal an error.
 163
 164    Use macro `CHAR_STRING (C, STR)' instead of calling this function
 165    directly if C can be an ASCII character.  */
 166
 167 int
 168 char_to_string (c, str)
 169      int c;
 170      unsigned char *str;
 171 {
 172   unsigned char *p = str;
 173
 174   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
 175     {
 176       /* Multibyte character can't have a modifier bit.  */
 177       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 178         invalid_character (c);
 179
 180       /* For Meta, Shift, and Control modifiers, we need special care.  */
 181       if (c & CHAR_META)
 182         {
 183           /* Move the meta bit to the right place for a string.  */
 184           c = (c & ~CHAR_META) | 0x80;
 185         }
 186       if (c & CHAR_SHIFT)
 187         {
 188           /* Shift modifier is valid only with [A-Za-z].  */
 189           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 190             c &= ~CHAR_SHIFT;
 191           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 192             c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 193         }
 194       if (c & CHAR_CTL)
 195         {
 196           /* Simulate the code in lread.c.  */
 197           /* Allow `\C- ' and `\C-?'.  */
 198           if (c == (CHAR_CTL | ' '))
 199             c = 0;
 200           else if (c == (CHAR_CTL | '?'))
 201             c = 127;
 202           /* ASCII control chars are made from letters (both cases),
 203              as well as the non-letters within 0100...0137.  */
 204           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 205             c &= (037 | (~0177 & ~CHAR_CTL));
 206           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 207             c &= (037 | (~0177 & ~CHAR_CTL));
 208         }
 209
 210       /* If C still has any modifier bits, it is an invalid character.  */
 211       if (c & CHAR_MODIFIER_MASK)
 212         invalid_character (c);
 213     }
 214   if (SINGLE_BYTE_CHAR_P (c))
 215     {
 216       if (ASCII_BYTE_P (c) || c >= 0xA0)
 217         *p++ = c;
 218       else
 219         {
 220           *p++ = LEADING_CODE_8_BIT_CONTROL;
 221           *p++ = c + 0x20;
 222         }
 223     }
 224   else if (c < MAX_CHAR)
 225     {
 226       int charset, c1, c2;
 227
 228       SPLIT_CHAR (c, charset, c1, c2);
 229
 230       if (charset >= LEADING_CODE_EXT_11)
 231         *p++ = (charset < LEADING_CODE_EXT_12
 232                 ? LEADING_CODE_PRIVATE_11
 233                 : (charset < LEADING_CODE_EXT_21
 234                    ? LEADING_CODE_PRIVATE_12
 235                    : (charset < LEADING_CODE_EXT_22
 236                       ? LEADING_CODE_PRIVATE_21
 237                       : LEADING_CODE_PRIVATE_22)));
 238       *p++ = charset;
 239       if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
 240         invalid_character (c);
 241       if (c1)
 242         {
 243           *p++ = c1 | 0x80;
 244           if (c2 > 0)
 245             *p++ = c2 | 0x80;
 246         }
 247     }
 248   else
 249     invalid_character (c);
 250
 251   return (p - str);
 252 }
 253
 254 /* Return the non-ASCII character corresponding to multi-byte form at
 255    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
 256    length of the multibyte form in *ACTUAL_LEN.
 257
 258    Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
 259    this function directly if you want ot handle ASCII characters as
 260    well.  */
 261
 262 int
 263 string_to_char (str, len, actual_len)
 264      const unsigned char *str;
 265      int len, *actual_len;
 266 {
 267   int c, bytes, charset, c1, c2;
 268
 269   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 270   c = MAKE_CHAR (charset, c1, c2);
 271   if (actual_len)
 272     *actual_len = bytes;
 273   return c;
 274 }
 275
 276 /* Return the length of the multi-byte form at string STR of length LEN.
 277    Use the macro MULTIBYTE_FORM_LENGTH instead.  */
 278 int
 279 multibyte_form_length (str, len)
 280      const unsigned char *str;
 281      int len;
 282 {
 283   int bytes;
 284
 285   PARSE_MULTIBYTE_SEQ (str, len, bytes);
 286   return bytes;
 287 }
 288
 289 /* Check multibyte form at string STR of length LEN and set variables
 290    pointed by CHARSET, C1, and C2 to charset and position codes of the
 291    character at STR, and return 0.  If there's no multibyte character,
 292    return -1.  This should be used only in the macro SPLIT_STRING
 293    which checks range of STR in advance.  */
 294
 295 int
 296 split_string (str, len, charset, c1, c2)
 297      const unsigned char *str;
 298      unsigned char *c1, *c2;
 299      int len, *charset;
 300 {
 301   register int bytes, cs, code1, code2 = -1;
 302
 303   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
 304   if (cs == CHARSET_ASCII)
 305     return -1;
 306   *charset = cs;
 307   *c1 = code1;
 308   *c2 = code2;
 309   return 0;
 310 }
 311
 312 /* Return 1 iff character C has valid printable glyph.
 313    Use the macro CHAR_PRINTABLE_P instead.  */
 314 int
 315 char_printable_p (c)
 316      int c;
 317 {
 318   int charset, c1, c2, chars;
 319
 320   if (ASCII_BYTE_P (c))
 321     return 1;
 322   else if (SINGLE_BYTE_CHAR_P (c))
 323     return 0;
 324   else if (c >= MAX_CHAR)
 325     return 0;
 326
 327   SPLIT_CHAR (c, charset, c1, c2);
 328   if (! CHARSET_DEFINED_P (charset))
 329     return 0;
 330   if (CHARSET_CHARS (charset) == 94
 331       ? c1 <= 32 || c1 >= 127
 332       : c1 < 32)
 333     return 0;
 334   if (CHARSET_DIMENSION (charset) == 2
 335       && (CHARSET_CHARS (charset) == 94
 336           ? c2 <= 32 || c2 >= 127
 337           : c2 < 32))
 338     return 0;
 339   return 1;
 340 }
 341
 342 /* Translate character C by translation table TABLE.  If C
 343    is negative, translate a character specified by CHARSET, C1, and C2
 344    (C1 and C2 are code points of the character).  If no translation is
 345    found in TABLE, return C.  */
 346 int
 347 translate_char (table, c, charset, c1, c2)
 348      Lisp_Object table;
 349      int c, charset, c1, c2;
 350 {
 351   Lisp_Object ch;
 352   int alt_charset, alt_c1, alt_c2, dimension;
 353
 354   if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
 355   if (!CHAR_TABLE_P (table)
 356       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
 357     return c;
 358
 359   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
 360   dimension = CHARSET_DIMENSION (alt_charset);
 361   if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
 362     /* CH is not a generic character, just return it.  */
 363     return XFASTINT (ch);
 364
 365   /* Since CH is a generic character, we must return a specific
 366      charater which has the same position codes as C from CH.  */
 367   if (charset < 0)
 368     SPLIT_CHAR (c, charset, c1, c2);
 369   if (dimension != CHARSET_DIMENSION (charset))
 370     /* We can't make such a character because of dimension mismatch.  */
 371     return c;
 372   return MAKE_CHAR (alt_charset, c1, c2);
 373 }
 374
 375 /* Convert the unibyte character C to multibyte based on
 376    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
 377    convert C to a valid multibyte character, convert it based on
 378    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
 379
 380 int
 381 unibyte_char_to_multibyte (c)
 382      int c;
 383 {
 384   if (c < 0400 && c >= 0200)
 385     {
 386       int c_save = c;
 387
 388       if (! NILP (Vnonascii_translation_table))
 389         {
 390           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
 391           if (c >= 0400 && ! char_valid_p (c, 0))
 392             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 393         }
 394       else if (c >= 0240 && nonascii_insert_offset > 0)
 395         {
 396           c += nonascii_insert_offset;
 397           if (c < 0400 || ! char_valid_p (c, 0))
 398             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 399         }
 400       else if (c >= 0240)
 401         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 402     }
 403   return c;
 404 }
 405
 406
 407 /* Convert the multibyte character C to unibyte 8-bit character based
 408    on Vnonascii_translation_table or nonascii_insert_offset.  If
 409    REV_TBL is non-nil, it should be a reverse table of
 410    Vnonascii_translation_table, i.e. what given by:
 411      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
 412
 413 int
 414 multibyte_char_to_unibyte (c, rev_tbl)
 415      int c;
 416      Lisp_Object rev_tbl;
 417 {
 418   if (!SINGLE_BYTE_CHAR_P (c))
 419     {
 420       int c_save = c;
 421
 422       if (! CHAR_TABLE_P (rev_tbl)
 423           && CHAR_TABLE_P (Vnonascii_translation_table))
 424         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
 425                                           make_number (0));
 426       if (CHAR_TABLE_P (rev_tbl))
 427         {
 428           Lisp_Object temp;
 429           temp = Faref (rev_tbl, make_number (c));
 430           if (INTEGERP (temp))
 431             c = XINT (temp);
 432           if (c >= 256)
 433             c = (c_save & 0177) + 0200;
 434         }
 435       else
 436         {
 437           if (nonascii_insert_offset > 0)
 438             c -= nonascii_insert_offset;
 439           if (c < 128 || c >= 256)
 440             c = (c_save & 0177) + 0200;
 441         }
 442     }
 443
 444   return c;
 445 }
 446
 447 \f
 448 /* Update the table Vcharset_table with the given arguments (see the
 449    document of `define-charset' for the meaning of each argument).
 450    Several other table contents are also updated.  The caller should
 451    check the validity of CHARSET-ID and the remaining arguments in
 452    advance.  */
 453
 454 void
 455 update_charset_table (charset_id, dimension, chars, width, direction,
 456                       iso_final_char, iso_graphic_plane,
 457                       short_name, long_name, description)
 458      Lisp_Object charset_id, dimension, chars, width, direction;
 459      Lisp_Object iso_final_char, iso_graphic_plane;
 460      Lisp_Object short_name, long_name, description;
 461 {
 462   int charset = XINT (charset_id);
 463   int bytes;
 464   unsigned char leading_code_base, leading_code_ext;
 465
 466   if (NILP (CHARSET_TABLE_ENTRY (charset)))
 467     CHARSET_TABLE_ENTRY (charset)
 468       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
 469
 470   if (NILP (long_name))
 471     long_name = short_name;
 472   if (NILP (description))
 473     description = long_name;
 474
 475   /* Get byte length of multibyte form, base leading-code, and
 476      extended leading-code of the charset.  See the comment under the
 477      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
 478   bytes = XINT (dimension);
 479   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 480     {
 481       /* Official charset, it doesn't have an extended leading-code.  */
 482       if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
 483         bytes += 1; /* For a base leading-code.  */
 484       leading_code_base = charset;
 485       leading_code_ext = 0;
 486     }
 487   else
 488     {
 489       /* Private charset.  */
 490       bytes += 2; /* For base and extended leading-codes.  */
 491       leading_code_base
 492         = (charset < LEADING_CODE_EXT_12
 493            ? LEADING_CODE_PRIVATE_11
 494            : (charset < LEADING_CODE_EXT_21
 495               ? LEADING_CODE_PRIVATE_12
 496               : (charset < LEADING_CODE_EXT_22
 497                  ? LEADING_CODE_PRIVATE_21
 498                  : LEADING_CODE_PRIVATE_22)));
 499       leading_code_ext = charset;
 500     }
 501
 502   if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
 503       &&BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
 504     error ("Invalid dimension for the charset-ID %d", charset);
 505
 506   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
 507   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
 508   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
 509   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
 510   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
 511   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
 512   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
 513     = make_number (leading_code_base);
 514   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
 515     = make_number (leading_code_ext);
 516   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
 517   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
 518     = iso_graphic_plane;
 519   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
 520   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
 521   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
 522   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
 523
 524   {
 525     /* If we have already defined a charset which has the same
 526        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
 527        DIRECTION, we must update the entry REVERSE-CHARSET of both
 528        charsets.  If there's no such charset, the value of the entry
 529        is set to nil.  */
 530     int i;
 531
 532     for (i = 0; i <= MAX_CHARSET; i++)
 533       if (!NILP (CHARSET_TABLE_ENTRY (i)))
 534         {
 535           if (CHARSET_DIMENSION (i) == XINT (dimension)
 536               && CHARSET_CHARS (i) == XINT (chars)
 537               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
 538               && CHARSET_DIRECTION (i) != XINT (direction))
 539             {
 540               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 541                 = make_number (i);
 542               CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
 543               break;
 544             }
 545         }
 546     if (i > MAX_CHARSET)
 547       /* No such a charset.  */
 548       CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 549         = make_number (-1);
 550   }
 551
 552   if (charset != CHARSET_ASCII
 553       && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 554     {
 555       width_by_char_head[leading_code_base] = XINT (width);
 556
 557       /* Update table emacs_code_class.  */
 558       emacs_code_class[charset] = (bytes == 2
 559                                    ? EMACS_leading_code_2
 560                                    : (bytes == 3
 561                                       ? EMACS_leading_code_3
 562                                       : EMACS_leading_code_4));
 563     }
 564
 565   /* Update table iso_charset_table.  */
 566   if (iso_final_char >= 0
 567       && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
 568     ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
 569 }
 570
 571 #ifdef emacs
 572
 573 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
 574    is invalid.  */
 575 int
 576 get_charset_id (charset_symbol)
 577      Lisp_Object charset_symbol;
 578 {
 579   Lisp_Object val;
 580   int charset;
 581
 582   return ((SYMBOLP (charset_symbol)
 583            && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
 584            && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
 585                CHARSET_VALID_P (charset)))
 586           ? charset : -1);
 587 }
 588
 589 /* Return an identification number for a new private charset of
 590    DIMENSION and WIDTH.  If there's no more room for the new charset,
 591    return 0.  */
 592 Lisp_Object
 593 get_new_private_charset_id (dimension, width)
 594      int dimension, width;
 595 {
 596   int charset, from, to;
 597
 598   if (dimension == 1)
 599     {
 600       if (width == 1)
 601         from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
 602       else
 603         from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
 604     }
 605   else
 606     {
 607       if (width == 1)
 608         from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
 609       else
 610         from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
 611     }
 612
 613   for (charset = from; charset < to; charset++)
 614     if (!CHARSET_DEFINED_P (charset)) break;
 615
 616   return make_number (charset < to ? charset : 0);
 617 }
 618
 619 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
 620   "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
 621 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
 622  treated as a private charset.\n\
 623 INFO-VECTOR is a vector of the format:\n\
 624    [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
 625     SHORT-NAME LONG-NAME DESCRIPTION]\n\
 626 The meanings of each elements is as follows:\n\
 627 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
 628 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
 629 WIDTH (integer) is the number of columns a character in the charset\n\
 630 occupies on the screen: one of 0, 1, and 2.\n\
 631 \n\
 632 DIRECTION (integer) is the rendering direction of characters in the\n\
 633 charset when rendering.  If 0, render from left to right, else\n\
 634 render from right to left.\n\
 635 \n\
 636 ISO-FINAL-CHAR (character) is the final character of the\n\
 637 corresponding ISO 2022 charset.\n\
 638 It may be -1 if the charset is internal use only.\n\
 639 \n\
 640 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
 641 while encoding to variants of ISO 2022 coding system, one of the\n\
 642 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
 643 It may be -1 if the charset is internal use only.\n\
 644 \n\
 645 SHORT-NAME (string) is the short name to refer to the charset.\n\
 646 \n\
 647 LONG-NAME (string) is the long name to refer to the charset.\n\
 648 \n\
 649 DESCRIPTION (string) is the description string of the charset.")
 650   (charset_id, charset_symbol, info_vector)
 651      Lisp_Object charset_id, charset_symbol, info_vector;
 652 {
 653   Lisp_Object *vec;
 654
 655   if (!NILP (charset_id))
 656     CHECK_NUMBER (charset_id, 0);
 657   CHECK_SYMBOL (charset_symbol, 1);
 658   CHECK_VECTOR (info_vector, 2);
 659
 660   if (! NILP (charset_id))
 661     {
 662       if (! CHARSET_VALID_P (XINT (charset_id)))
 663         error ("Invalid CHARSET: %d", XINT (charset_id));
 664       else if (CHARSET_DEFINED_P (XINT (charset_id)))
 665         error ("Already defined charset: %d", XINT (charset_id));
 666     }
 667
 668   vec = XVECTOR (info_vector)->contents;
 669   if (XVECTOR (info_vector)->size != 9
 670       || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
 671       || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
 672       || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
 673       || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
 674       || !INTEGERP (vec[4])
 675       || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
 676       || !INTEGERP (vec[5])
 677       || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
 678       || !STRINGP (vec[6])
 679       || !STRINGP (vec[7])
 680       || !STRINGP (vec[8]))
 681     error ("Invalid info-vector argument for defining charset %s",
 682            XSYMBOL (charset_symbol)->name->data);
 683
 684   if (NILP (charset_id))
 685     {
 686       charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
 687       if (XINT (charset_id) == 0)
 688         error ("There's no room for a new private charset %s",
 689                XSYMBOL (charset_symbol)->name->data);
 690     }
 691
 692   update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
 693                         vec[4], vec[5], vec[6], vec[7], vec[8]);
 694   Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
 695   CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
 696   Vcharset_list = Fcons (charset_symbol, Vcharset_list);
 697   return Qnil;
 698 }
 699
 700 DEFUN ("generic-character-list", Fgeneric_character_list,
 701        Sgeneric_character_list, 0, 0, 0,
 702   "Return a list of all possible generic characters.\n\
 703 It includes a generic character for a charset not yet defined.")
 704   ()
 705 {
 706   return Vgeneric_character_list;
 707 }
 708
 709 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
 710        Sget_unused_iso_final_char, 2, 2, 0,
 711   "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
 712 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
 713 CHARS is the number of characters in a dimension: 94 or 96.\n\
 714 \n\
 715 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
 716 If there's no unused final char for the specified kind of charset,\n\
 717 return nil.")
 718   (dimension, chars)
 719      Lisp_Object dimension, chars;
 720 {
 721   int final_char;
 722
 723   CHECK_NUMBER (dimension, 0);
 724   CHECK_NUMBER (chars, 1);
 725   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 726     error ("Invalid charset dimension %d, it should be 1 or 2",
 727            XINT (dimension));
 728   if (XINT (chars) != 94 && XINT (chars) != 96)
 729     error ("Invalid charset chars %d, it should be 94 or 96",
 730            XINT (chars));
 731   for (final_char = '0'; final_char <= '?'; final_char++)
 732     {
 733       if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
 734         break;
 735     }
 736   return (final_char <= '?' ? make_number (final_char) : Qnil);
 737 }
 738
 739 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
 740        4, 4, 0,
 741   "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
 742 CHARSET should be defined by `defined-charset' in advance.")
 743   (dimension, chars, final_char, charset_symbol)
 744      Lisp_Object dimension, chars, final_char, charset_symbol;
 745 {
 746   int charset;
 747
 748   CHECK_NUMBER (dimension, 0);
 749   CHECK_NUMBER (chars, 1);
 750   CHECK_NUMBER (final_char, 2);
 751   CHECK_SYMBOL (charset_symbol, 3);
 752
 753   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 754     error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
 755   if (XINT (chars) != 94 && XINT (chars) != 96)
 756     error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
 757   if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
 758     error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
 759   if ((charset = get_charset_id (charset_symbol)) < 0)
 760     error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
 761
 762   ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
 763   return Qnil;
 764 }
 765
 766 /* Return information about charsets in the text at PTR of NBYTES
 767    bytes, which are NCHARS characters.  The value is:
 768
 769         0: Each character is represented by one byte.  This is always
 770            true for unibyte text.
 771         1: No charsets other than ascii eight-bit-control,
 772            eight-bit-graphic, and latin-1 are found.
 773         2: Otherwise.
 774
 775    In addition, if CHARSETS is nonzero, for each found charset N, set
 776    CHARSETS[N] to 1.  For that, callers should allocate CHARSETS
 777    (MAX_CHARSET + 1 elements) in advance.  It may lookup a translation
 778    table TABLE if supplied.  For invalid charsets, set CHARSETS[1] to
 779    1 (note that there's no charset whose ID is 1).  */
 780
 781 int
 782 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
 783      unsigned char *ptr;
 784      int nchars, nbytes, *charsets;
 785      Lisp_Object table;
 786 {
 787   if (nchars == nbytes)
 788     {
 789       if (charsets && nbytes > 0)
 790         {
 791           unsigned char *endp = ptr + nbytes;
 792           int maskbits = 0;
 793
 794           while (ptr < endp && maskbits != 7)
 795             {
 796               maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
 797               ptr++;
 798             }
 799
 800           if (maskbits & 1)
 801             charsets[CHARSET_ASCII] = 1;
 802           if (maskbits & 2)
 803             charsets[CHARSET_8_BIT_CONTROL] = 1;
 804           if (maskbits & 4)
 805             charsets[CHARSET_8_BIT_GRAPHIC] = 1;
 806         }
 807       return 0;
 808     }
 809   else
 810     {
 811       int return_val = 1;
 812       int bytes, charset, c1, c2;
 813
 814       if (! CHAR_TABLE_P (table))
 815         table = Qnil;
 816
 817       while (nchars-- > 0)
 818         {
 819           SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
 820           ptr += bytes;
 821
 822           if (!CHARSET_DEFINED_P (charset))
 823             charset = 1;
 824           else if (! NILP (table))
 825             {
 826               int c = translate_char (table, -1, charset, c1, c2);
 827               if (c >= 0)
 828                 charset = CHAR_CHARSET (c);
 829             }
 830
 831           if (return_val == 1
 832               && charset != CHARSET_ASCII
 833               && charset != CHARSET_8_BIT_CONTROL
 834               && charset != CHARSET_8_BIT_GRAPHIC
 835               && charset != charset_latin_iso8859_1)
 836             return_val = 2;
 837
 838           if (charsets)
 839             charsets[charset] = 1;
 840           else if (return_val == 2)
 841             break;
 842         }
 843       return return_val;
 844     }
 845 }
 846
 847 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
 848        2, 3, 0,
 849   "Return a list of charsets in the region between BEG and END.\n\
 850 BEG and END are buffer positions.\n\
 851 Optional arg TABLE if non-nil is a translation table to look up.\n\
 852 \n\
 853 If the region contains invalid multiybte characters,\n\
 854 `unknown' is included in the returned list.\n\
 855 \n\
 856 If the current buffer is unibyte, the returned list may contain\n\
 857 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
 858   (beg, end, table)
 859      Lisp_Object beg, end, table;
 860 {
 861   int charsets[MAX_CHARSET + 1];
 862   int from, from_byte, to, stop, stop_byte, i;
 863   Lisp_Object val;
 864
 865   validate_region (&beg, &end);
 866   from = XFASTINT (beg);
 867   stop = to = XFASTINT (end);
 868
 869   if (from < GPT && GPT < to)
 870     {
 871       stop = GPT;
 872       stop_byte = GPT_BYTE;
 873     }
 874   else
 875     stop_byte = CHAR_TO_BYTE (stop);
 876
 877   from_byte = CHAR_TO_BYTE (from);
 878
 879   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 880   while (1)
 881     {
 882       find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
 883                             stop_byte - from_byte, charsets, table);
 884       if (stop < to)
 885         {
 886           from = stop, from_byte = stop_byte;
 887           stop = to, stop_byte = CHAR_TO_BYTE (stop);
 888         }
 889       else
 890         break;
 891     }
 892
 893   val = Qnil;
 894   if (charsets[1])
 895     val = Fcons (Qunknown, val);
 896   for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
 897     if (charsets[i])
 898       val = Fcons (CHARSET_SYMBOL (i), val);
 899   if (charsets[0])
 900     val = Fcons (Qascii, val);
 901   return val;
 902 }
 903
 904 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
 905        1, 2, 0,
 906   "Return a list of charsets in STR.\n\
 907 Optional arg TABLE if non-nil is a translation table to look up.\n\
 908 \n\
 909 If the region contains invalid multiybte characters,\n\
 910 `unknown' is included in the returned list.\n\
 911 \n\
 912 If STR is unibyte, the returned list may contain\n\
 913 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
 914   (str, table)
 915      Lisp_Object str, table;
 916 {
 917   int charsets[MAX_CHARSET + 1];
 918   int i;
 919   Lisp_Object val;
 920
 921   CHECK_STRING (str, 0);
 922
 923   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 924   find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
 925                         STRING_BYTES (XSTRING (str)), charsets, table);
 926
 927   val = Qnil;
 928   if (charsets[1])
 929     val = Fcons (Qunknown, val);
 930   for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
 931     if (charsets[i])
 932       val = Fcons (CHARSET_SYMBOL (i), val);
 933   if (charsets[0])
 934     val = Fcons (Qascii, val);
 935   return val;
 936 }
 937
 938 \f
 939 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
 940   "")
 941   (charset, code1, code2)
 942      Lisp_Object charset, code1, code2;
 943 {
 944   int charset_id, c1, c2;
 945
 946   CHECK_NUMBER (charset, 0);
 947   charset_id = XINT (charset);
 948   if (!CHARSET_DEFINED_P (charset_id))
 949     error ("Invalid charset ID: %d", XINT (charset));
 950
 951   if (NILP (code1))
 952     c1 = 0;
 953   else
 954     {
 955       CHECK_NUMBER (code1, 1);
 956       c1 = XINT (code1);
 957     }
 958   if (NILP (code2))
 959     c2 = 0;
 960   else
 961     {
 962       CHECK_NUMBER (code2, 2);
 963       c2 = XINT (code2);
 964     }
 965
 966   if (charset_id == CHARSET_ASCII)
 967     {
 968       if (c1 < 0 || c1 > 0x7F)
 969         goto invalid_code_posints;
 970       return make_number (c1);
 971     }
 972   else if (charset_id == CHARSET_8_BIT_CONTROL)
 973     {
 974       if (c1 < 0x80 || c1 > 0x9F)
 975         goto invalid_code_posints;
 976       return make_number (c1);
 977     }
 978   else if (charset_id == CHARSET_8_BIT_GRAPHIC)
 979     {
 980       if (c1 < 0xA0 || c1 > 0xFF)
 981         goto invalid_code_posints;
 982       return make_number (c1);
 983     }
 984   else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
 985     goto invalid_code_posints;
 986   c1 &= 0x7F;
 987   c2 &= 0x7F;
 988   if (c1 == 0
 989       ? c2 != 0
 990       : (c2 == 0
 991          ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
 992          : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
 993     goto invalid_code_posints;
 994   return make_number (MAKE_CHAR (charset_id, c1, c2));
 995
 996  invalid_code_posints:
 997   error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
 998 }
 999
1000 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1001   "Return list of charset and one or two position-codes of CHAR.\n\
1002 If CHAR is invalid as a character code,\n\
1003 return a list of symbol `unknown' and CHAR.")
1004   (ch)
1005      Lisp_Object ch;
1006 {
1007   Lisp_Object val;
1008   int c, charset, c1, c2;
1009
1010   CHECK_NUMBER (ch, 0);
1011   c = XFASTINT (ch);
1012   if (!CHAR_VALID_P (c, 1))
1013     return Fcons (Qunknown, Fcons (ch, Qnil));
1014   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1015   return (c2 >= 0
1016           ? Fcons (CHARSET_SYMBOL (charset),
1017                    Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1018           : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1019 }
1020
1021 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1022   "Return charset of CHAR.")
1023   (ch)
1024      Lisp_Object ch;
1025 {
1026   CHECK_NUMBER (ch, 0);
1027
1028   return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1029 }
1030
1031 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1032   "Return charset of a character in the current buffer at position POS.\n\
1033 If POS is nil, it defauls to the current point.\n\
1034 If POS is out of range, the value is nil.")
1035   (pos)
1036      Lisp_Object pos;
1037 {
1038   Lisp_Object ch;
1039   int charset;
1040
1041   ch = Fchar_after (pos);
1042   if (! INTEGERP (ch))
1043     return ch;
1044   charset = CHAR_CHARSET (XINT (ch));
1045   return CHARSET_SYMBOL (charset);
1046 }
1047
1048 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1049   "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1050 \n\
1051 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1052 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1053 where as Emacs distinguishes them by charset symbol.\n\
1054 See the documentation of the function `charset-info' for the meanings of\n\
1055 DIMENSION, CHARS, and FINAL-CHAR.")
1056   (dimension, chars, final_char)
1057      Lisp_Object dimension, chars, final_char;
1058 {
1059   int charset;
1060
1061   CHECK_NUMBER (dimension, 0);
1062   CHECK_NUMBER (chars, 1);
1063   CHECK_NUMBER (final_char, 2);
1064
1065   if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1066     return Qnil;
1067   return CHARSET_SYMBOL (charset);
1068 }
1069
1070 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1071    generic character.  If GENERICP is zero, return nonzero iff C is a
1072    valid normal character.  Do not call this function directly,
1073    instead use macro CHAR_VALID_P.  */
1074 int
1075 char_valid_p (c, genericp)
1076      int c, genericp;
1077 {
1078   int charset, c1, c2;
1079
1080   if (c < 0)
1081     return 0;
1082   if (SINGLE_BYTE_CHAR_P (c))
1083     return 1;
1084   SPLIT_CHAR (c, charset, c1, c2);
1085   if (genericp)
1086     {
1087       if (c1)
1088         {
1089           if (c2 <= 0) c2 = 0x20;
1090         }
1091       else
1092         {
1093           if (c2 <= 0) c1 = c2 = 0x20;
1094         }
1095     }
1096   return (CHARSET_DEFINED_P (charset)
1097           && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1098 }
1099
1100 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1101   "Return t if OBJECT is a valid normal character.\n\
1102 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1103 a valid generic character.")
1104   (object, genericp)
1105      Lisp_Object object, genericp;
1106 {
1107   if (! NATNUMP (object))
1108     return Qnil;
1109   return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1110 }
1111
1112 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1113        Sunibyte_char_to_multibyte, 1, 1, 0,
1114   "Convert the unibyte character CH to multibyte character.\n\
1115 The conversion is done based on `nonascii-translation-table' (which see)\n\
1116  or `nonascii-insert-offset' (which see).")
1117   (ch)
1118      Lisp_Object ch;
1119 {
1120   int c;
1121
1122   CHECK_NUMBER (ch, 0);
1123   c = XINT (ch);
1124   if (c < 0 || c >= 0400)
1125     error ("Invalid unibyte character: %d", c);
1126   c = unibyte_char_to_multibyte (c);
1127   if (c < 0)
1128     error ("Can't convert to multibyte character: %d", XINT (ch));
1129   return make_number (c);
1130 }
1131
1132 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1133        Smultibyte_char_to_unibyte, 1, 1, 0,
1134   "Convert the multibyte character CH to unibyte character.\n\
1135 The conversion is done based on `nonascii-translation-table' (which see)\n\
1136  or `nonascii-insert-offset' (which see).")
1137   (ch)
1138      Lisp_Object ch;
1139 {
1140   int c;
1141
1142   CHECK_NUMBER (ch, 0);
1143   c = XINT (ch);
1144   if (! CHAR_VALID_P (c, 0))
1145     error ("Invalid multibyte character: %d", c);
1146   c = multibyte_char_to_unibyte (c, Qnil);
1147   if (c < 0)
1148     error ("Can't convert to unibyte character: %d", XINT (ch));
1149   return make_number (c);
1150 }
1151
1152 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1153   "Return 1 regardless of the argument CHAR.\n\
1154 This is now an obsolete function.  We keep it just for backward compatibility.")
1155   (ch)
1156      Lisp_Object ch;
1157 {
1158   Lisp_Object val;
1159
1160   CHECK_NUMBER (ch, 0);
1161   return make_number (1);
1162 }
1163
1164 /* Return how many bytes C will occupy in a multibyte buffer.
1165    Don't call this function directly, instead use macro CHAR_BYTES.  */
1166 int
1167 char_bytes (c)
1168      int c;
1169 {
1170   int charset;
1171
1172   if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1173     return 1;
1174   if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1175     return 1;
1176
1177   charset = CHAR_CHARSET (c);
1178   return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1179 }
1180
1181 /* Return the width of character of which multi-byte form starts with
1182    C.  The width is measured by how many columns occupied on the
1183    screen when displayed in the current buffer.  */
1184
1185 #define ONE_BYTE_CHAR_WIDTH(c)                                          \
1186   (c < 0x20                                                             \
1187    ? (c == '\t'                                                         \
1188       ? XFASTINT (current_buffer->tab_width)                            \
1189       : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2)))   \
1190    : (c < 0x7f                                                          \
1191       ? 1                                                               \
1192       : (c == 0x7F                                                      \
1193          ? (NILP (current_buffer->ctl_arrow) ? 4 : 2)                   \
1194          : ((! NILP (current_buffer->enable_multibyte_characters)       \
1195              && BASE_LEADING_CODE_P (c))                                \
1196             ? WIDTH_BY_CHAR_HEAD (c)                                    \
1197             : 4))))
1198
1199 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1200   "Return width of CHAR when displayed in the current buffer.\n\
1201 The width is measured by how many columns it occupies on the screen.")
1202   (ch)
1203        Lisp_Object ch;
1204 {
1205   Lisp_Object val, disp;
1206   int c;
1207   struct Lisp_Char_Table *dp = buffer_display_table ();
1208
1209   CHECK_NUMBER (ch, 0);
1210
1211   c = XINT (ch);
1212
1213   /* Get the way the display table would display it.  */
1214   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1215
1216   if (VECTORP (disp))
1217     XSETINT (val, XVECTOR (disp)->size);
1218   else if (SINGLE_BYTE_CHAR_P (c))
1219     XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1220   else
1221     {
1222       int charset = CHAR_CHARSET (c);
1223
1224       XSETFASTINT (val, CHARSET_WIDTH (charset));
1225     }
1226   return val;
1227 }
1228
1229 /* Return width of string STR of length LEN when displayed in the
1230    current buffer.  The width is measured by how many columns it
1231    occupies on the screen.  */
1232
1233 int
1234 strwidth (str, len)
1235      unsigned char *str;
1236      int len;
1237 {
1238   unsigned char *endp = str + len;
1239   int width = 0;
1240   struct Lisp_Char_Table *dp = buffer_display_table ();
1241
1242   while (str < endp)
1243     {
1244       Lisp_Object disp;
1245       int thislen;
1246       int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1247
1248       /* Get the way the display table would display it.  */
1249       if (dp)
1250         disp = DISP_CHAR_VECTOR (dp, c);
1251       else
1252         disp = Qnil;
1253
1254       if (VECTORP (disp))
1255         width += XVECTOR (disp)->size;
1256       else
1257         width += ONE_BYTE_CHAR_WIDTH (*str);
1258
1259       str += thislen;
1260     }
1261   return width;
1262 }
1263
1264 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1265   "Return width of STRING when displayed in the current buffer.\n\
1266 Width is measured by how many columns it occupies on the screen.\n\
1267 When calculating width of a multibyte character in STRING,\n\
1268 only the base leading-code is considered; the validity of\n\
1269 the following bytes is not checked.")
1270   (str)
1271      Lisp_Object str;
1272 {
1273   Lisp_Object val;
1274
1275   CHECK_STRING (str, 0);
1276   XSETFASTINT (val, strwidth (XSTRING (str)->data,
1277                               STRING_BYTES (XSTRING (str))));
1278   return val;
1279 }
1280
1281 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1282   "Return the direction of CHAR.\n\
1283 The returned value is 0 for left-to-right and 1 for right-to-left.")
1284   (ch)
1285      Lisp_Object ch;
1286 {
1287   int charset;
1288
1289   CHECK_NUMBER (ch, 0);
1290   charset = CHAR_CHARSET (XFASTINT (ch));
1291   if (!CHARSET_DEFINED_P (charset))
1292     invalid_character (XINT (ch));
1293   return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1294 }
1295
1296 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1297   "Return number of characters between BEG and END.")
1298   (beg, end)
1299      Lisp_Object beg, end;
1300 {
1301   int from, to;
1302
1303   CHECK_NUMBER_COERCE_MARKER (beg, 0);
1304   CHECK_NUMBER_COERCE_MARKER (end, 1);
1305
1306   from = min (XFASTINT (beg), XFASTINT (end));
1307   to = max (XFASTINT (beg), XFASTINT (end));
1308
1309   return make_number (to - from);
1310 }
1311
1312 /* Return the number of characters in the NBYTES bytes at PTR.
1313    This works by looking at the contents and checking for multibyte sequences.
1314    However, if the current buffer has enable-multibyte-characters = nil,
1315    we treat each byte as a character.  */
1316
1317 int
1318 chars_in_text (ptr, nbytes)
1319      unsigned char *ptr;
1320      int nbytes;
1321 {
1322   /* current_buffer is null at early stages of Emacs initialization.  */
1323   if (current_buffer == 0
1324       || NILP (current_buffer->enable_multibyte_characters))
1325     return nbytes;
1326
1327   return multibyte_chars_in_text (ptr, nbytes);
1328 }
1329
1330 /* Return the number of characters in the NBYTES bytes at PTR.
1331    This works by looking at the contents and checking for multibyte sequences.
1332    It ignores enable-multibyte-characters.  */
1333
1334 int
1335 multibyte_chars_in_text (ptr, nbytes)
1336      unsigned char *ptr;
1337      int nbytes;
1338 {
1339   unsigned char *endp;
1340   int chars, bytes;
1341
1342   endp = ptr + nbytes;
1343   chars = 0;
1344
1345   while (ptr < endp)
1346     {
1347       PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1348       ptr += bytes;
1349       chars++;
1350     }
1351
1352   return chars;
1353 }
1354
1355 /* Parse unibyte text at STR of LEN bytes as a multibyte text, and
1356    count the numbers of characters and bytes in it.  On counting
1357    bytes, pay attention to that 8-bit characters in the range
1358    0x80..0x9F are represented by 2-byte in a multibyte text.  */
1359 void
1360 parse_str_as_multibyte (str, len, nchars, nbytes)
1361      unsigned char *str;
1362      int len, *nchars, *nbytes;
1363 {
1364   unsigned char *endp = str + len;
1365   int n, chars = 0, bytes = 0;
1366
1367   while (str < endp)
1368     {
1369       if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1370         str += n, bytes += n;
1371       else
1372         str++, bytes += 2;
1373       chars++;
1374     }
1375   *nchars = chars;
1376   *nbytes = bytes;
1377   return;
1378 }
1379
1380 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
1381    It actually converts only 8-bit characters in the range 0x80..0x9F
1382    that don't contruct multibyte characters to multibyte forms.  If
1383    NCHARS is nonzero, set *NCHARS to the number of characters in the
1384    text.  It is assured that we can use LEN bytes at STR as a work
1385    area and that is enough.  Return the number of bytes of the
1386    resulting text.  */
1387
1388 int
1389 str_as_multibyte (str, len, nbytes, nchars)
1390      unsigned char *str;
1391      int len, nbytes, *nchars;
1392 {
1393   unsigned char *p = str, *endp = str + nbytes;
1394   unsigned char *to;
1395   int chars = 0;
1396   int n;
1397
1398   while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1399     p += n, chars++;
1400   if (nchars)
1401     *nchars = chars;
1402   if (p == endp)
1403     return nbytes;
1404
1405   to = p;
1406   nbytes = endp - p;
1407   endp = str + len;
1408   safe_bcopy (p, endp - nbytes, nbytes);
1409   p = endp - nbytes;
1410   while (p < endp)
1411     {
1412       if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1413         {
1414           while (n--)
1415             *to++ = *p++;
1416         }
1417       else
1418         {
1419           *to++ = LEADING_CODE_8_BIT_CONTROL;
1420           *to++ = *p++ + 0x20;
1421         }
1422       chars++;
1423     }
1424   if (nchars)
1425     *nchars = chars;
1426   return (to - str);
1427 }
1428
1429 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
1430    that contains the same single-byte characters.  It actually
1431    converts all 8-bit characters to multibyte forms.  It is assured
1432    that we can use LEN bytes at STR as a work area and that is
1433    enough.  */
1434
1435 int
1436 str_to_multibyte (str, len, bytes)
1437      unsigned char *str;
1438      int len, bytes;
1439 {
1440   unsigned char *p = str, *endp = str + bytes;
1441   unsigned char *to;
1442   int c;
1443
1444   while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1445   if (p == endp)
1446     return bytes;
1447   to = p;
1448   bytes = endp - p;
1449   endp = str + len;
1450   safe_bcopy (p, endp - bytes, bytes);
1451   p = endp - bytes;
1452   while (p < endp)
1453     {
1454       if (*p < 0x80 || *p >= 0xA0)
1455         *to++ = *p++;
1456       else
1457         *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1458     }
1459   return (to - str);
1460 }
1461
1462 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
1463    actually converts only 8-bit characters in the range 0x80..0x9F to
1464    unibyte forms.  */
1465
1466 int
1467 str_as_unibyte (str, bytes)
1468      unsigned char *str;
1469      int bytes;
1470 {
1471   unsigned char *p = str, *endp = str + bytes;
1472   unsigned char *to = str;
1473
1474   while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1475   to = p;
1476   while (p < endp)
1477     {
1478       if (*p == LEADING_CODE_8_BIT_CONTROL)
1479         *to++ = *(p + 1) - 0x20, p += 2;
1480       else
1481         *to++ = *p++;
1482     }
1483   return (to - str);
1484 }
1485
1486 \f
1487 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1488   "Concatenate all the argument characters and make the result a string.")
1489   (n, args)
1490      int n;
1491      Lisp_Object *args;
1492 {
1493   int i;
1494   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1495   unsigned char *p = buf;
1496   int c;
1497
1498   for (i = 0; i < n; i++)
1499     {
1500       CHECK_NUMBER (args[i], 0);
1501       c = XINT (args[i]);
1502       p += CHAR_STRING (c, p);
1503     }
1504
1505   return make_string_from_bytes (buf, n, p - buf);
1506 }
1507
1508 #endif /* emacs */
1509 \f
1510 int
1511 charset_id_internal (charset_name)
1512      char *charset_name;
1513 {
1514   Lisp_Object val;
1515
1516   val= Fget (intern (charset_name), Qcharset);
1517   if (!VECTORP (val))
1518     error ("Charset %s is not defined", charset_name);
1519
1520   return (XINT (XVECTOR (val)->contents[0]));
1521 }
1522
1523 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1524        Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1525    ()
1526 {
1527   charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1528   charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1529   charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1530   charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1531   charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1532   charset_big5_1 = charset_id_internal ("chinese-big5-1");
1533   charset_big5_2 = charset_id_internal ("chinese-big5-2");
1534   return Qnil;
1535 }
1536
1537 void
1538 init_charset_once ()
1539 {
1540   int i, j, k;
1541
1542   staticpro (&Vcharset_table);
1543   staticpro (&Vcharset_symbol_table);
1544   staticpro (&Vgeneric_character_list);
1545
1546   /* This has to be done here, before we call Fmake_char_table.  */
1547   Qcharset_table = intern ("charset-table");
1548   staticpro (&Qcharset_table);
1549
1550   /* Intern this now in case it isn't already done.
1551      Setting this variable twice is harmless.
1552      But don't staticpro it here--that is done in alloc.c.  */
1553   Qchar_table_extra_slots = intern ("char-table-extra-slots");
1554
1555   /* Now we are ready to set up this property, so we can
1556      create the charset table.  */
1557   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1558   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1559
1560   Qunknown = intern ("unknown");
1561   staticpro (&Qunknown);
1562   Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1563                                         Qunknown);
1564
1565   /* Setup tables.  */
1566   for (i = 0; i < 2; i++)
1567     for (j = 0; j < 2; j++)
1568       for (k = 0; k < 128; k++)
1569         iso_charset_table [i][j][k] = -1;
1570
1571   for (i = 0; i < 256; i++)
1572     bytes_by_char_head[i] = 1;
1573   for (i = 128; i < MIN_CHARSET_OFFICIAL_DIMENSION2; i++)
1574     bytes_by_char_head[i] = 2;
1575   for (; i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1576     bytes_by_char_head[i] = 3;
1577   for (; i < 160; i++)
1578     bytes_by_char_head[i] = 2;
1579   bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1580   bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1581   bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1582   bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1583
1584   for (i = 0; i < 128; i++)
1585     width_by_char_head[i] = 1;
1586   for (; i < 256; i++)
1587     width_by_char_head[i] = 4;
1588   width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1589   width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1590   width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1591   width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1592
1593   {
1594     Lisp_Object val;
1595
1596     val = Qnil;
1597     for (i = 0x81; i < 0x90; i++)
1598       val = Fcons (make_number ((i - 0x70) << 7), val);
1599     for (; i < 0x9A; i++)
1600       val = Fcons (make_number ((i - 0x8F) << 14), val);
1601     for (i = 0xA0; i < 0xF0; i++)
1602       val = Fcons (make_number ((i - 0x70) << 7), val);
1603     for (; i < 0xFF; i++)
1604       val = Fcons (make_number ((i - 0xE0) << 14), val);
1605     Vgeneric_character_list = Fnreverse (val);
1606   }
1607
1608   nonascii_insert_offset = 0;
1609   Vnonascii_translation_table = Qnil;
1610 }
1611
1612 #ifdef emacs
1613
1614 void
1615 syms_of_charset ()
1616 {
1617   Qcharset = intern ("charset");
1618   staticpro (&Qcharset);
1619
1620   Qascii = intern ("ascii");
1621   staticpro (&Qascii);
1622
1623   Qeight_bit_control = intern ("eight-bit-control");
1624   staticpro (&Qeight_bit_control);
1625
1626   Qeight_bit_graphic = intern ("eight-bit-graphic");
1627   staticpro (&Qeight_bit_graphic);
1628
1629   /* Define special charsets ascii, eight-bit-control, and
1630      eight-bit-graphic.  */
1631   update_charset_table (make_number (CHARSET_ASCII),
1632                         make_number (1), make_number (94),
1633                         make_number (1),
1634                         make_number (0),
1635                         make_number ('B'),
1636                         make_number (0),
1637                         build_string ("ASCII"),
1638                         Qnil,   /* same as above */
1639                         build_string ("ASCII (ISO646 IRV)"));
1640   CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1641   Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1642
1643   update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1644                         make_number (1), make_number (96),
1645                         make_number (4),
1646                         make_number (0),
1647                         make_number (-1),
1648                         make_number (-1),
1649                         build_string ("8-bit control code (0x80..0x9F)"),
1650                         Qnil,   /* same as above */
1651                         Qnil);  /* same as above */
1652   CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1653   Fput (Qeight_bit_control, Qcharset,
1654         CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1655
1656   update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1657                         make_number (1), make_number (96),
1658                         make_number (4),
1659                         make_number (0),
1660                         make_number (-1),
1661                         make_number (-1),
1662                         build_string ("8-bit graphic char (0xA0..0xFF)"),
1663                         Qnil,   /* same as above */
1664                         Qnil);  /* same as above */
1665   CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1666   Fput (Qeight_bit_graphic, Qcharset,
1667         CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1668
1669   Qauto_fill_chars = intern ("auto-fill-chars");
1670   staticpro (&Qauto_fill_chars);
1671   Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1672
1673   defsubr (&Sdefine_charset);
1674   defsubr (&Sgeneric_character_list);
1675   defsubr (&Sget_unused_iso_final_char);
1676   defsubr (&Sdeclare_equiv_charset);
1677   defsubr (&Sfind_charset_region);
1678   defsubr (&Sfind_charset_string);
1679   defsubr (&Smake_char_internal);
1680   defsubr (&Ssplit_char);
1681   defsubr (&Schar_charset);
1682   defsubr (&Scharset_after);
1683   defsubr (&Siso_charset);
1684   defsubr (&Schar_valid_p);
1685   defsubr (&Sunibyte_char_to_multibyte);
1686   defsubr (&Smultibyte_char_to_unibyte);
1687   defsubr (&Schar_bytes);
1688   defsubr (&Schar_width);
1689   defsubr (&Sstring_width);
1690   defsubr (&Schar_direction);
1691   defsubr (&Schars_in_region);
1692   defsubr (&Sstring);
1693   defsubr (&Ssetup_special_charsets);
1694
1695   DEFVAR_LISP ("charset-list", &Vcharset_list,
1696     "List of charsets ever defined.");
1697   Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1698                                         Fcons (Qeight_bit_graphic, Qnil)));
1699
1700   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1701     "Vector of cons cell of a symbol and translation table ever defined.\n\
1702 An ID of a translation table is an index of this vector.");
1703   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1704
1705   DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1706     "Leading-code of private TYPE9N charset of column-width 1.");
1707   leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1708
1709   DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1710     "Leading-code of private TYPE9N charset of column-width 2.");
1711   leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1712
1713   DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1714     "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1715   leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1716
1717   DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1718     "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1719   leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1720
1721   DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1722     "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1723 This is used for converting unibyte text to multibyte,\n\
1724 and for inserting character codes specified by number.\n\n\
1725 This serves to convert a Latin-1 or similar 8-bit character code\n\
1726 to the corresponding Emacs multibyte character code.\n\
1727 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1728 for your choice of character set.\n\
1729 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1730   nonascii_insert_offset = 0;
1731
1732   DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1733     "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1734 This is used for converting unibyte text to multibyte,\n\
1735 and for inserting character codes specified by number.\n\n\
1736 Conversion is performed only when multibyte characters are enabled,\n\
1737 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1738 to the corresponding Emacs character code.\n\n\
1739 If this is nil, `nonascii-insert-offset' is used instead.\n\
1740 See also the docstring of `make-translation-table'.");
1741   Vnonascii_translation_table = Qnil;
1742
1743   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1744     "A char-table for characters which invoke auto-filling.\n\
1745 Such characters have value t in this table.");
1746   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1747   CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1748   CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1749 }
1750
1751 #endif /* emacs */