code.delx.au - gnu-emacs/blob - src/charset.c

   1 /* Basic multilingual character support.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* At first, see the document in `charset.h' to understand the code in
  23    this file.  */
  24
  25 #ifdef emacs
  26 #include <config.h>
  27 #endif
  28
  29 #include <stdio.h>
  30
  31 #ifdef emacs
  32
  33 #include <sys/types.h>
  34 #include "lisp.h"
  35 #include "buffer.h"
  36 #include "charset.h"
  37 #include "coding.h"
  38 #include "disptab.h"
  39
  40 #else  /* not emacs */
  41
  42 #include "mulelib.h"
  43
  44 #endif /* emacs */
  45
  46 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
  47 Lisp_Object Qunknown;
  48
  49 /* Declaration of special leading-codes.  */
  50 int leading_code_private_11;    /* for private DIMENSION1 of 1-column */
  51 int leading_code_private_12;    /* for private DIMENSION1 of 2-column */
  52 int leading_code_private_21;    /* for private DIMENSION2 of 1-column */
  53 int leading_code_private_22;    /* for private DIMENSION2 of 2-column */
  54
  55 /* Declaration of special charsets.  The values are set by
  56    Fsetup_special_charsets.  */
  57 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
  58 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
  59 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
  60 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
  61 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
  62 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
  63 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
  64
  65 Lisp_Object Qcharset_table;
  66
  67 /* A char-table containing information of each character set.  */
  68 Lisp_Object Vcharset_table;
  69
  70 /* A vector of charset symbol indexed by charset-id.  This is used
  71    only for returning charset symbol from C functions.  */
  72 Lisp_Object Vcharset_symbol_table;
  73
  74 /* A list of charset symbols ever defined.  */
  75 Lisp_Object Vcharset_list;
  76
  77 /* Vector of translation table ever defined.
  78    ID of a translation table is used to index this vector.  */
  79 Lisp_Object Vtranslation_table_vector;
  80
  81 /* A char-table for characters which may invoke auto-filling.  */
  82 Lisp_Object Vauto_fill_chars;
  83
  84 Lisp_Object Qauto_fill_chars;
  85
  86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
  87 int bytes_by_char_head[256];
  88 int width_by_char_head[256];
  89
  90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
  91    CHARS, and FINAL-CHAR) to Emacs' charset.  */
  92 int iso_charset_table[2][2][128];
  93
  94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  95 unsigned char *_fetch_multibyte_char_p;
  96 int _fetch_multibyte_char_len;
  97
  98 /* Offset to add to a non-ASCII value when inserting it.  */
  99 int nonascii_insert_offset;
 100
 101 /* Translation table for converting non-ASCII unibyte characters
 102    to multibyte codes, or nil.  */
 103 Lisp_Object Vnonascii_translation_table;
 104
 105 /* List of all possible generic characters.  */
 106 Lisp_Object Vgeneric_character_list;
 107
 108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
 109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
 110 \f
 111 void
 112 invalid_character (c)
 113      int c;
 114 {
 115   error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
 116 }
 117
 118 /* Parse string STR of length LENGTH and fetch information of a
 119    character at STR.  Set BYTES to the byte length the character
 120    occupies, CHARSET, C1, C2 to proper values of the character. */
 121
 122 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)             \
 123   do {                                                                       \
 124     (c1) = *(str);                                                           \
 125     (bytes) = BYTES_BY_CHAR_HEAD (c1);                                       \
 126     if ((bytes) == 1)                                                        \
 127       (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
 128     else if ((bytes) == 2)                                                   \
 129       {                                                                      \
 130         if ((c1) == LEADING_CODE_8_BIT_CONTROL)                              \
 131           (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20;         \
 132         else                                                                 \
 133           (charset) = (c1), (c1) = (str)[1] & 0x7F;                          \
 134       }                                                                      \
 135     else if ((bytes) == 3)                                                   \
 136       {                                                                      \
 137         if ((c1) < LEADING_CODE_PRIVATE_11)                                  \
 138           (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F;  \
 139         else                                                                 \
 140           (charset) = (str)[1], (c1) = (str)[2] & 0x7F;                      \
 141       }                                                                      \
 142     else                                                                     \
 143       (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F;  \
 144   } while (0)
 145
 146 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  */
 147 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
 148   ((charset) == CHARSET_ASCII                           \
 149    ? ((c1) >= 0 && (c1) <= 0x7F)                        \
 150    : ((charset) == CHARSET_8_BIT_CONTROL                \
 151       ? ((c1) >= 0x80 && (c1) <= 0x9F)                  \
 152       : ((charset) == CHARSET_8_BIT_GRAPHIC             \
 153          ? ((c1) >= 0x80 && (c1) <= 0xFF)               \
 154          : (CHARSET_DIMENSION (charset) == 1            \
 155             ? ((c1) >= 0x20 && (c1) <= 0x7F)            \
 156             : ((c1) >= 0x20 && (c1) <= 0x7F             \
 157                && (c2) >= 0x20 && (c2) <= 0x7F)))))
 158
 159 /* Store multi-byte form of the character C in STR.  The caller should
 160    allocate at least 4-byte area at STR in advance.  Returns the
 161    length of the multi-byte form.  If C is an invalid character code,
 162    signal an error.
 163
 164    Use macro `CHAR_STRING (C, STR)' instead of calling this function
 165    directly if C can be an ASCII character.  */
 166
 167 int
 168 char_to_string (c, str)
 169      int c;
 170      unsigned char *str;
 171 {
 172   unsigned char *p = str;
 173
 174   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
 175     {
 176       /* Multibyte character can't have a modifier bit.  */
 177       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 178         invalid_character (c);
 179
 180       /* For Meta, Shift, and Control modifiers, we need special care.  */
 181       if (c & CHAR_META)
 182         {
 183           /* Move the meta bit to the right place for a string.  */
 184           c = (c & ~CHAR_META) | 0x80;
 185         }
 186       if (c & CHAR_SHIFT)
 187         {
 188           /* Shift modifier is valid only with [A-Za-z].  */
 189           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 190             c &= ~CHAR_SHIFT;
 191           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 192             c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 193         }
 194       if (c & CHAR_CTL)
 195         {
 196           /* Simulate the code in lread.c.  */
 197           /* Allow `\C- ' and `\C-?'.  */
 198           if (c == (CHAR_CTL | ' '))
 199             c = 0;
 200           else if (c == (CHAR_CTL | '?'))
 201             c = 127;
 202           /* ASCII control chars are made from letters (both cases),
 203              as well as the non-letters within 0100...0137.  */
 204           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 205             c &= (037 | (~0177 & ~CHAR_CTL));
 206           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 207             c &= (037 | (~0177 & ~CHAR_CTL));
 208         }
 209
 210       /* If C still has any modifier bits, it is an invalid character.  */
 211       if (c & CHAR_MODIFIER_MASK)
 212         invalid_character (c);
 213     }
 214   if (SINGLE_BYTE_CHAR_P (c))
 215     {
 216       if (ASCII_BYTE_P (c) || c >= 0xA0)
 217         *p++ = c;
 218       else
 219         {
 220           *p++ = LEADING_CODE_8_BIT_CONTROL;
 221           *p++ = c + 0x20;
 222         }
 223     }
 224   else if (c < MAX_CHAR)
 225     {
 226       int charset, c1, c2;
 227
 228       SPLIT_CHAR (c, charset, c1, c2);
 229
 230       if (charset >= LEADING_CODE_EXT_11)
 231         *p++ = (charset < LEADING_CODE_EXT_12
 232                 ? LEADING_CODE_PRIVATE_11
 233                 : (charset < LEADING_CODE_EXT_21
 234                    ? LEADING_CODE_PRIVATE_12
 235                    : (charset < LEADING_CODE_EXT_22
 236                       ? LEADING_CODE_PRIVATE_21
 237                       : LEADING_CODE_PRIVATE_22)));
 238       *p++ = charset;
 239       if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
 240         invalid_character (c);
 241       if (c1)
 242         {
 243           *p++ = c1 | 0x80;
 244           if (c2 > 0)
 245             *p++ = c2 | 0x80;
 246         }
 247     }
 248   else
 249     invalid_character (c);
 250
 251   return (p - str);
 252 }
 253
 254 /* Return the non-ASCII character corresponding to multi-byte form at
 255    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
 256    length of the multibyte form in *ACTUAL_LEN.
 257
 258    Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
 259    this function directly if you want ot handle ASCII characters as
 260    well.  */
 261
 262 int
 263 string_to_char (str, len, actual_len)
 264      const unsigned char *str;
 265      int len, *actual_len;
 266 {
 267   int c, bytes, charset, c1, c2;
 268
 269   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 270   c = MAKE_CHAR (charset, c1, c2);
 271   if (actual_len)
 272     *actual_len = bytes;
 273   return c;
 274 }
 275
 276 /* Return the length of the multi-byte form at string STR of length LEN.
 277    Use the macro MULTIBYTE_FORM_LENGTH instead.  */
 278 int
 279 multibyte_form_length (str, len)
 280      const unsigned char *str;
 281      int len;
 282 {
 283   int bytes;
 284
 285   PARSE_MULTIBYTE_SEQ (str, len, bytes);
 286   return bytes;
 287 }
 288
 289 /* Check multibyte form at string STR of length LEN and set variables
 290    pointed by CHARSET, C1, and C2 to charset and position codes of the
 291    character at STR, and return 0.  If there's no multibyte character,
 292    return -1.  This should be used only in the macro SPLIT_STRING
 293    which checks range of STR in advance.  */
 294
 295 int
 296 split_string (str, len, charset, c1, c2)
 297      const unsigned char *str;
 298      unsigned char *c1, *c2;
 299      int len, *charset;
 300 {
 301   register int bytes, cs, code1, code2 = -1;
 302
 303   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
 304   if (cs == CHARSET_ASCII)
 305     return -1;
 306   *charset = cs;
 307   *c1 = code1;
 308   *c2 = code2;
 309   return 0;
 310 }
 311
 312 /* Return 1 iff character C has valid printable glyph.
 313    Use the macro CHAR_PRINTABLE_P instead.  */
 314 int
 315 char_printable_p (c)
 316      int c;
 317 {
 318   int charset, c1, c2, chars;
 319
 320   if (ASCII_BYTE_P (c))
 321     return 1;
 322   else if (SINGLE_BYTE_CHAR_P (c))
 323     return 0;
 324   else if (c >= MAX_CHAR)
 325     return 0;
 326
 327   SPLIT_CHAR (c, charset, c1, c2);
 328   if (! CHARSET_DEFINED_P (charset))
 329     return 0;
 330   if (CHARSET_CHARS (charset) == 94
 331       ? c1 <= 32 || c1 >= 127
 332       : c1 < 32)
 333     return 0;
 334   if (CHARSET_DIMENSION (charset) == 2
 335       && (CHARSET_CHARS (charset) == 94
 336           ? c2 <= 32 || c2 >= 127
 337           : c2 < 32))
 338     return 0;
 339   return 1;
 340 }
 341
 342 /* Translate character C by translation table TABLE.  If C
 343    is negative, translate a character specified by CHARSET, C1, and C2
 344    (C1 and C2 are code points of the character).  If no translation is
 345    found in TABLE, return C.  */
 346 int
 347 translate_char (table, c, charset, c1, c2)
 348      Lisp_Object table;
 349      int c, charset, c1, c2;
 350 {
 351   Lisp_Object ch;
 352   int alt_charset, alt_c1, alt_c2, dimension;
 353
 354   if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
 355   if (!CHAR_TABLE_P (table)
 356       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
 357     return c;
 358
 359   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
 360   dimension = CHARSET_DIMENSION (alt_charset);
 361   if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
 362     /* CH is not a generic character, just return it.  */
 363     return XFASTINT (ch);
 364
 365   /* Since CH is a generic character, we must return a specific
 366      charater which has the same position codes as C from CH.  */
 367   if (charset < 0)
 368     SPLIT_CHAR (c, charset, c1, c2);
 369   if (dimension != CHARSET_DIMENSION (charset))
 370     /* We can't make such a character because of dimension mismatch.  */
 371     return c;
 372   return MAKE_CHAR (alt_charset, c1, c2);
 373 }
 374
 375 /* Convert the unibyte character C to multibyte based on
 376    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
 377    convert C to a valid multibyte character, convert it based on
 378    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
 379
 380 int
 381 unibyte_char_to_multibyte (c)
 382      int c;
 383 {
 384   if (c < 0400 && c >= 0200)
 385     {
 386       int c_save = c;
 387
 388       if (! NILP (Vnonascii_translation_table))
 389         {
 390           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
 391           if (c >= 0400 && ! char_valid_p (c, 0))
 392             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 393         }
 394       else if (c >= 0240 && nonascii_insert_offset > 0)
 395         {
 396           c += nonascii_insert_offset;
 397           if (c < 0400 || ! char_valid_p (c, 0))
 398             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 399         }
 400       else if (c >= 0240)
 401         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 402     }
 403   return c;
 404 }
 405
 406
 407 /* Convert the multibyte character C to unibyte 8-bit character based
 408    on Vnonascii_translation_table or nonascii_insert_offset.  If
 409    REV_TBL is non-nil, it should be a reverse table of
 410    Vnonascii_translation_table, i.e. what given by:
 411      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
 412
 413 int
 414 multibyte_char_to_unibyte (c, rev_tbl)
 415      int c;
 416      Lisp_Object rev_tbl;
 417 {
 418   if (!SINGLE_BYTE_CHAR_P (c))
 419     {
 420       int c_save = c;
 421
 422       if (! CHAR_TABLE_P (rev_tbl)
 423           && CHAR_TABLE_P (Vnonascii_translation_table))
 424         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
 425                                           make_number (0));
 426       if (CHAR_TABLE_P (rev_tbl))
 427         {
 428           Lisp_Object temp;
 429           temp = Faref (rev_tbl, make_number (c));
 430           if (INTEGERP (temp))
 431             c = XINT (temp);
 432           if (c >= 256)
 433             c = (c_save & 0177) + 0200;
 434         }
 435       else
 436         {
 437           if (nonascii_insert_offset > 0)
 438             c -= nonascii_insert_offset;
 439           if (c < 128 || c >= 256)
 440             c = (c_save & 0177) + 0200;
 441         }
 442     }
 443
 444   return c;
 445 }
 446
 447 \f
 448 /* Update the table Vcharset_table with the given arguments (see the
 449    document of `define-charset' for the meaning of each argument).
 450    Several other table contents are also updated.  The caller should
 451    check the validity of CHARSET-ID and the remaining arguments in
 452    advance.  */
 453
 454 void
 455 update_charset_table (charset_id, dimension, chars, width, direction,
 456                       iso_final_char, iso_graphic_plane,
 457                       short_name, long_name, description)
 458      Lisp_Object charset_id, dimension, chars, width, direction;
 459      Lisp_Object iso_final_char, iso_graphic_plane;
 460      Lisp_Object short_name, long_name, description;
 461 {
 462   int charset = XINT (charset_id);
 463   int bytes;
 464   unsigned char leading_code_base, leading_code_ext;
 465
 466   if (NILP (CHARSET_TABLE_ENTRY (charset)))
 467     CHARSET_TABLE_ENTRY (charset)
 468       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
 469
 470   /* Get byte length of multibyte form, base leading-code, and
 471      extended leading-code of the charset.  See the comment under the
 472      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
 473   bytes = XINT (dimension);
 474   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 475     {
 476       /* Official charset, it doesn't have an extended leading-code.  */
 477       if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
 478         bytes += 1; /* For a base leading-code.  */
 479       leading_code_base = charset;
 480       leading_code_ext = 0;
 481     }
 482   else
 483     {
 484       /* Private charset.  */
 485       bytes += 2; /* For base and extended leading-codes.  */
 486       leading_code_base
 487         = (charset < LEADING_CODE_EXT_12
 488            ? LEADING_CODE_PRIVATE_11
 489            : (charset < LEADING_CODE_EXT_21
 490               ? LEADING_CODE_PRIVATE_12
 491               : (charset < LEADING_CODE_EXT_22
 492                  ? LEADING_CODE_PRIVATE_21
 493                  : LEADING_CODE_PRIVATE_22)));
 494       leading_code_ext = charset;
 495     }
 496
 497   if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
 498       &&BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
 499     error ("Invalid dimension for the charset-ID %d", charset);
 500
 501   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
 502   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
 503   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
 504   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
 505   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
 506   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
 507   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
 508     = make_number (leading_code_base);
 509   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
 510     = make_number (leading_code_ext);
 511   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
 512   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
 513     = iso_graphic_plane;
 514   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
 515   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
 516   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
 517   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
 518
 519   {
 520     /* If we have already defined a charset which has the same
 521        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
 522        DIRECTION, we must update the entry REVERSE-CHARSET of both
 523        charsets.  If there's no such charset, the value of the entry
 524        is set to nil.  */
 525     int i;
 526
 527     for (i = 0; i <= MAX_CHARSET; i++)
 528       if (!NILP (CHARSET_TABLE_ENTRY (i)))
 529         {
 530           if (CHARSET_DIMENSION (i) == XINT (dimension)
 531               && CHARSET_CHARS (i) == XINT (chars)
 532               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
 533               && CHARSET_DIRECTION (i) != XINT (direction))
 534             {
 535               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 536                 = make_number (i);
 537               CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
 538               break;
 539             }
 540         }
 541     if (i > MAX_CHARSET)
 542       /* No such a charset.  */
 543       CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 544         = make_number (-1);
 545   }
 546
 547   if (charset != CHARSET_ASCII
 548       && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 549     {
 550       width_by_char_head[leading_code_base] = XINT (width);
 551
 552       /* Update table emacs_code_class.  */
 553       emacs_code_class[charset] = (bytes == 2
 554                                    ? EMACS_leading_code_2
 555                                    : (bytes == 3
 556                                       ? EMACS_leading_code_3
 557                                       : EMACS_leading_code_4));
 558     }
 559
 560   /* Update table iso_charset_table.  */
 561   if (iso_final_char >= 0
 562       && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
 563     ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
 564 }
 565
 566 #ifdef emacs
 567
 568 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
 569    is invalid.  */
 570 int
 571 get_charset_id (charset_symbol)
 572      Lisp_Object charset_symbol;
 573 {
 574   Lisp_Object val;
 575   int charset;
 576
 577   return ((SYMBOLP (charset_symbol)
 578            && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
 579            && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
 580                CHARSET_VALID_P (charset)))
 581           ? charset : -1);
 582 }
 583
 584 /* Return an identification number for a new private charset of
 585    DIMENSION and WIDTH.  If there's no more room for the new charset,
 586    return 0.  */
 587 Lisp_Object
 588 get_new_private_charset_id (dimension, width)
 589      int dimension, width;
 590 {
 591   int charset, from, to;
 592
 593   if (dimension == 1)
 594     {
 595       if (width == 1)
 596         from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
 597       else
 598         from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
 599     }
 600   else
 601     {
 602       if (width == 1)
 603         from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
 604       else
 605         from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
 606     }
 607
 608   for (charset = from; charset < to; charset++)
 609     if (!CHARSET_DEFINED_P (charset)) break;
 610
 611   return make_number (charset < to ? charset : 0);
 612 }
 613
 614 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
 615   "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
 616 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
 617  treated as a private charset.\n\
 618 INFO-VECTOR is a vector of the format:\n\
 619    [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
 620     SHORT-NAME LONG-NAME DESCRIPTION]\n\
 621 The meanings of each elements is as follows:\n\
 622 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
 623 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
 624 WIDTH (integer) is the number of columns a character in the charset\n\
 625 occupies on the screen: one of 0, 1, and 2.\n\
 626 \n\
 627 DIRECTION (integer) is the rendering direction of characters in the\n\
 628 charset when rendering.  If 0, render from left to right, else\n\
 629 render from right to left.\n\
 630 \n\
 631 ISO-FINAL-CHAR (character) is the final character of the\n\
 632 corresponding ISO 2022 charset.\n\
 633 It may be -1 if the charset is internal use only.\n\
 634 \n\
 635 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
 636 while encoding to variants of ISO 2022 coding system, one of the\n\
 637 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
 638 It may be -1 if the charset is internal use only.\n\
 639 \n\
 640 SHORT-NAME (string) is the short name to refer to the charset.\n\
 641 \n\
 642 LONG-NAME (string) is the long name to refer to the charset.\n\
 643 \n\
 644 DESCRIPTION (string) is the description string of the charset.")
 645   (charset_id, charset_symbol, info_vector)
 646      Lisp_Object charset_id, charset_symbol, info_vector;
 647 {
 648   Lisp_Object *vec;
 649
 650   if (!NILP (charset_id))
 651     CHECK_NUMBER (charset_id, 0);
 652   CHECK_SYMBOL (charset_symbol, 1);
 653   CHECK_VECTOR (info_vector, 2);
 654
 655   if (! NILP (charset_id))
 656     {
 657       if (! CHARSET_VALID_P (XINT (charset_id)))
 658         error ("Invalid CHARSET: %d", XINT (charset_id));
 659       else if (CHARSET_DEFINED_P (XINT (charset_id)))
 660         error ("Already defined charset: %d", XINT (charset_id));
 661     }
 662
 663   vec = XVECTOR (info_vector)->contents;
 664   if (XVECTOR (info_vector)->size != 9
 665       || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
 666       || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
 667       || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
 668       || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
 669       || !INTEGERP (vec[4])
 670       || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
 671       || !INTEGERP (vec[5])
 672       || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
 673       || !STRINGP (vec[6])
 674       || !STRINGP (vec[7])
 675       || !STRINGP (vec[8]))
 676     error ("Invalid info-vector argument for defining charset %s",
 677            XSYMBOL (charset_symbol)->name->data);
 678
 679   if (NILP (charset_id))
 680     {
 681       charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
 682       if (XINT (charset_id) == 0)
 683         error ("There's no room for a new private charset %s",
 684                XSYMBOL (charset_symbol)->name->data);
 685     }
 686
 687   update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
 688                         vec[4], vec[5], vec[6], vec[7], vec[8]);
 689   Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
 690   CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
 691   Vcharset_list = Fcons (charset_symbol, Vcharset_list);
 692   return Qnil;
 693 }
 694
 695 DEFUN ("generic-character-list", Fgeneric_character_list,
 696        Sgeneric_character_list, 0, 0, 0,
 697   "Return a list of all possible generic characters.\n\
 698 It includes a generic character for a charset not yet defined.")
 699   ()
 700 {
 701   return Vgeneric_character_list;
 702 }
 703
 704 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
 705        Sget_unused_iso_final_char, 2, 2, 0,
 706   "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
 707 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
 708 CHARS is the number of characters in a dimension: 94 or 96.\n\
 709 \n\
 710 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
 711 If there's no unused final char for the specified kind of charset,\n\
 712 return nil.")
 713   (dimension, chars)
 714      Lisp_Object dimension, chars;
 715 {
 716   int final_char;
 717
 718   CHECK_NUMBER (dimension, 0);
 719   CHECK_NUMBER (chars, 1);
 720   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 721     error ("Invalid charset dimension %d, it should be 1 or 2",
 722            XINT (dimension));
 723   if (XINT (chars) != 94 && XINT (chars) != 96)
 724     error ("Invalid charset chars %d, it should be 94 or 96",
 725            XINT (chars));
 726   for (final_char = '0'; final_char <= '?'; final_char++)
 727     {
 728       if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
 729         break;
 730     }
 731   return (final_char <= '?' ? make_number (final_char) : Qnil);
 732 }
 733
 734 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
 735        4, 4, 0,
 736   "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
 737 CHARSET should be defined by `defined-charset' in advance.")
 738   (dimension, chars, final_char, charset_symbol)
 739      Lisp_Object dimension, chars, final_char, charset_symbol;
 740 {
 741   int charset;
 742
 743   CHECK_NUMBER (dimension, 0);
 744   CHECK_NUMBER (chars, 1);
 745   CHECK_NUMBER (final_char, 2);
 746   CHECK_SYMBOL (charset_symbol, 3);
 747
 748   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 749     error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
 750   if (XINT (chars) != 94 && XINT (chars) != 96)
 751     error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
 752   if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
 753     error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
 754   if ((charset = get_charset_id (charset_symbol)) < 0)
 755     error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
 756
 757   ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
 758   return Qnil;
 759 }
 760
 761 /* Return information about charsets in the text at PTR of NBYTES
 762    bytes, which are NCHARS characters.  The value is:
 763
 764         0: Each character is represented by one byte.  This is always
 765            true for unibyte text.
 766         1: No charsets other than ascii eight-bit-control,
 767            eight-bit-graphic, and latin-1 are found.
 768         2: Otherwise.
 769
 770    In addition, if CHARSETS is nonzero, for each found charset N, set
 771    CHARSETS[N] to 1.  For that, callers should allocate CHARSETS
 772    (MAX_CHARSET + 1 elements) in advance.  It may lookup a translation
 773    table TABLE if supplied.  For invalid charsets, set CHARSETS[1] to
 774    1 (note that there's no charset whose ID is 1).  */
 775
 776 int
 777 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
 778      unsigned char *ptr;
 779      int nchars, nbytes, *charsets;
 780      Lisp_Object table;
 781 {
 782   if (nchars == nbytes)
 783     {
 784       if (charsets && nbytes > 0)
 785         {
 786           unsigned char *endp = ptr + nbytes;
 787           int maskbits = 0;
 788
 789           while (ptr < endp && maskbits != 7)
 790             {
 791               maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
 792               ptr++;
 793             }
 794
 795           if (maskbits & 1)
 796             charsets[CHARSET_ASCII] = 1;
 797           if (maskbits & 2)
 798             charsets[CHARSET_8_BIT_CONTROL] = 1;
 799           if (maskbits & 4)
 800             charsets[CHARSET_8_BIT_GRAPHIC] = 1;
 801         }
 802       return 0;
 803     }
 804   else
 805     {
 806       int return_val = 1;
 807       int bytes, charset, c1, c2;
 808
 809       if (! CHAR_TABLE_P (table))
 810         table = Qnil;
 811
 812       while (nchars-- > 0)
 813         {
 814           SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
 815           ptr += bytes;
 816
 817           if (!CHARSET_DEFINED_P (charset))
 818             charset = 1;
 819           else if (! NILP (table))
 820             {
 821               int c = translate_char (table, -1, charset, c1, c2);
 822               if (c >= 0)
 823                 charset = CHAR_CHARSET (c);
 824             }
 825
 826           if (return_val == 1
 827               && charset != CHARSET_ASCII
 828               && charset != CHARSET_8_BIT_CONTROL
 829               && charset != CHARSET_8_BIT_GRAPHIC
 830               && charset != charset_latin_iso8859_1)
 831             return_val = 2;
 832
 833           if (charsets)
 834             charsets[charset] = 1;
 835           else if (return_val == 2)
 836             break;
 837         }
 838       return return_val;
 839     }
 840 }
 841
 842 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
 843        2, 3, 0,
 844   "Return a list of charsets in the region between BEG and END.\n\
 845 BEG and END are buffer positions.\n\
 846 Optional arg TABLE if non-nil is a translation table to look up.\n\
 847 \n\
 848 If the region contains invalid multiybte characters,\n\
 849 `unknown' is included in the returned list.\n\
 850 \n\
 851 If the current buffer is unibyte, the returned list may contain\n\
 852 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
 853   (beg, end, table)
 854      Lisp_Object beg, end, table;
 855 {
 856   int charsets[MAX_CHARSET + 1];
 857   int from, from_byte, to, stop, stop_byte, i;
 858   Lisp_Object val;
 859
 860   validate_region (&beg, &end);
 861   from = XFASTINT (beg);
 862   stop = to = XFASTINT (end);
 863
 864   if (from < GPT && GPT < to)
 865     {
 866       stop = GPT;
 867       stop_byte = GPT_BYTE;
 868     }
 869   else
 870     stop_byte = CHAR_TO_BYTE (stop);
 871
 872   from_byte = CHAR_TO_BYTE (from);
 873
 874   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 875   while (1)
 876     {
 877       find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
 878                             stop_byte - from_byte, charsets, table);
 879       if (stop < to)
 880         {
 881           from = stop, from_byte = stop_byte;
 882           stop = to, stop_byte = CHAR_TO_BYTE (stop);
 883         }
 884       else
 885         break;
 886     }
 887
 888   val = Qnil;
 889   if (charsets[1])
 890     val = Fcons (Qunknown, val);
 891   for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
 892     if (charsets[i])
 893       val = Fcons (CHARSET_SYMBOL (i), val);
 894   if (charsets[0])
 895     val = Fcons (Qascii, val);
 896   return val;
 897 }
 898
 899 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
 900        1, 2, 0,
 901   "Return a list of charsets in STR.\n\
 902 Optional arg TABLE if non-nil is a translation table to look up.\n\
 903 \n\
 904 If the region contains invalid multiybte characters,\n\
 905 `unknown' is included in the returned list.\n\
 906 \n\
 907 If STR is unibyte, the returned list may contain\n\
 908 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
 909   (str, table)
 910      Lisp_Object str, table;
 911 {
 912   int charsets[MAX_CHARSET + 1];
 913   int i;
 914   Lisp_Object val;
 915
 916   CHECK_STRING (str, 0);
 917
 918   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 919   find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
 920                         STRING_BYTES (XSTRING (str)), charsets, table);
 921
 922   val = Qnil;
 923   if (charsets[1])
 924     val = Fcons (Qunknown, val);
 925   for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
 926     if (charsets[i])
 927       val = Fcons (CHARSET_SYMBOL (i), val);
 928   if (charsets[0])
 929     val = Fcons (Qascii, val);
 930   return val;
 931 }
 932
 933 \f
 934 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
 935   "")
 936   (charset, code1, code2)
 937      Lisp_Object charset, code1, code2;
 938 {
 939   int charset_id, c1, c2;
 940
 941   CHECK_NUMBER (charset, 0);
 942   charset_id = XINT (charset);
 943   if (!CHARSET_DEFINED_P (charset_id))
 944     error ("Invalid charset ID: %d", XINT (charset));
 945
 946   if (NILP (code1))
 947     c1 = 0;
 948   else
 949     {
 950       CHECK_NUMBER (code1, 1);
 951       c1 = XINT (code1);
 952     }
 953   if (NILP (code2))
 954     c2 = 0;
 955   else
 956     {
 957       CHECK_NUMBER (code2, 2);
 958       c2 = XINT (code2);
 959     }
 960
 961   if (charset_id == CHARSET_ASCII)
 962     {
 963       if (c1 < 0 || c1 > 0x7F)
 964         goto invalid_code_posints;
 965       return make_number (c1);
 966     }
 967   else if (charset_id == CHARSET_8_BIT_CONTROL)
 968     {
 969       if (c1 < 0x80 || c1 > 0x9F)
 970         goto invalid_code_posints;
 971       return make_number (c1);
 972     }
 973   else if (charset_id == CHARSET_8_BIT_GRAPHIC)
 974     {
 975       if (c1 < 0xA0 || c1 > 0xFF)
 976         goto invalid_code_posints;
 977       return make_number (c1);
 978     }
 979   else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
 980     goto invalid_code_posints;
 981   c1 &= 0x7F;
 982   c2 &= 0x7F;
 983   if (c1 == 0
 984       ? c2 != 0
 985       : (c2 == 0
 986          ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
 987          : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
 988     goto invalid_code_posints;
 989   return make_number (MAKE_CHAR (charset_id, c1, c2));
 990
 991  invalid_code_posints:
 992   error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
 993 }
 994
 995 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
 996   "Return list of charset and one or two position-codes of CHAR.\n\
 997 If CHAR is invalid as a character code,\n\
 998 return a list of symbol `unknown' and CHAR.")
 999   (ch)
1000      Lisp_Object ch;
1001 {
1002   Lisp_Object val;
1003   int c, charset, c1, c2;
1004
1005   CHECK_NUMBER (ch, 0);
1006   c = XFASTINT (ch);
1007   if (!CHAR_VALID_P (c, 1))
1008     return Fcons (Qunknown, Fcons (ch, Qnil));
1009   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1010   return (c2 >= 0
1011           ? Fcons (CHARSET_SYMBOL (charset),
1012                    Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1013           : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1014 }
1015
1016 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1017   "Return charset of CHAR.")
1018   (ch)
1019      Lisp_Object ch;
1020 {
1021   CHECK_NUMBER (ch, 0);
1022
1023   return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1024 }
1025
1026 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1027   "Return charset of a character in the current buffer at position POS.\n\
1028 If POS is nil, it defauls to the current point.\n\
1029 If POS is out of range, the value is nil.")
1030   (pos)
1031      Lisp_Object pos;
1032 {
1033   Lisp_Object ch;
1034   int charset;
1035
1036   ch = Fchar_after (pos);
1037   if (! INTEGERP (ch))
1038     return ch;
1039   charset = CHAR_CHARSET (XINT (ch));
1040   return CHARSET_SYMBOL (charset);
1041 }
1042
1043 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1044   "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1045 \n\
1046 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1047 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1048 where as Emacs distinguishes them by charset symbol.\n\
1049 See the documentation of the function `charset-info' for the meanings of\n\
1050 DIMENSION, CHARS, and FINAL-CHAR.")
1051   (dimension, chars, final_char)
1052      Lisp_Object dimension, chars, final_char;
1053 {
1054   int charset;
1055
1056   CHECK_NUMBER (dimension, 0);
1057   CHECK_NUMBER (chars, 1);
1058   CHECK_NUMBER (final_char, 2);
1059
1060   if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1061     return Qnil;
1062   return CHARSET_SYMBOL (charset);
1063 }
1064
/* Return nonzero iff C is a valid character.  With GENERICP zero
   only normal characters qualify; with GENERICP nonzero generic
   characters qualify as well.  Negative C is never valid, and every
   single-byte character is.  Do not call this function directly,
   instead use macro CHAR_VALID_P.  */
int
char_valid_p (c, genericp)
     int c, genericp;
{
  int charset, c1, c2;

  if (c < 0)
    return 0;
  if (SINGLE_BYTE_CHAR_P (c))
    return 1;

  SPLIT_CHAR (c, charset, c1, c2);

  /* For the generic test, position codes that are absent (C2 not
     positive, and C1 zero in that case) are replaced by 0x20 before
     the components are checked.  */
  if (genericp && c2 <= 0)
    {
      if (! c1)
        c1 = 0x20;
      c2 = 0x20;
    }

  if (! CHARSET_DEFINED_P (charset))
    return 0;
  return CHAR_COMPONENTS_VALID_P (charset, c1, c2);
}
1094
1095 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1096   "Return t if OBJECT is a valid normal character.\n\
1097 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1098 a valid generic character.")
1099   (object, genericp)
1100      Lisp_Object object, genericp;
1101 {
1102   if (! NATNUMP (object))
1103     return Qnil;
1104   return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1105 }
1106
1107 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1108        Sunibyte_char_to_multibyte, 1, 1, 0,
1109   "Convert the unibyte character CH to multibyte character.\n\
1110 The conversion is done based on `nonascii-translation-table' (which see)\n\
1111  or `nonascii-insert-offset' (which see).")
1112   (ch)
1113      Lisp_Object ch;
1114 {
1115   int c;
1116
1117   CHECK_NUMBER (ch, 0);
1118   c = XINT (ch);
1119   if (c < 0 || c >= 0400)
1120     error ("Invalid unibyte character: %d", c);
1121   c = unibyte_char_to_multibyte (c);
1122   if (c < 0)
1123     error ("Can't convert to multibyte character: %d", XINT (ch));
1124   return make_number (c);
1125 }
1126
1127 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1128        Smultibyte_char_to_unibyte, 1, 1, 0,
1129   "Convert the multibyte character CH to unibyte character.\n\
1130 The conversion is done based on `nonascii-translation-table' (which see)\n\
1131  or `nonascii-insert-offset' (which see).")
1132   (ch)
1133      Lisp_Object ch;
1134 {
1135   int c;
1136
1137   CHECK_NUMBER (ch, 0);
1138   c = XINT (ch);
1139   if (! CHAR_VALID_P (c, 0))
1140     error ("Invalid multibyte character: %d", c);
1141   c = multibyte_char_to_unibyte (c, Qnil);
1142   if (c < 0)
1143     error ("Can't convert to unibyte character: %d", XINT (ch));
1144   return make_number (c);
1145 }
1146
1147 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1148   "Return 1 regardless of the argument CHAR.\n\
1149 This is now an obsolete function.  We keep it just for backward compatibility.")
1150   (ch)
1151      Lisp_Object ch;
1152 {
1153   Lisp_Object val;
1154
1155   CHECK_NUMBER (ch, 0);
1156   return make_number (1);
1157 }
1158
1159 /* Return how many bytes C will occupy in a multibyte buffer.
1160    Don't call this function directly, instead use macro CHAR_BYTES.  */
1161 int
1162 char_bytes (c)
1163      int c;
1164 {
1165   int charset;
1166
1167   if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1168     return 1;
1169   if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1170     return 1;
1171
1172   charset = CHAR_CHARSET (c);
1173   return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1174 }
1175
/* Return the display width, in columns, of the character whose
   multibyte form starts with byte C, as displayed in the current
   buffer:

     TAB                 the buffer's tab_width
     newline             0
     other C < 0x20      2 if the buffer's ctl_arrow is non-nil, else 4
     0x20..0x7E          1
     0x7F                2 if the buffer's ctl_arrow is non-nil, else 4
     C > 0x7F            WIDTH_BY_CHAR_HEAD (C) if the buffer enables
                         multibyte characters and C is a base
                         leading code, else 4

   C is parenthesized at every use so that any expression may be
   passed, but it is still evaluated more than once: do not pass an
   argument with side effects.  */

#define ONE_BYTE_CHAR_WIDTH(c)                                          \
  ((c) < 0x20                                                           \
   ? ((c) == '\t'                                                       \
      ? XFASTINT (current_buffer->tab_width)                            \
      : ((c) == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
   : ((c) < 0x7f                                                        \
      ? 1                                                               \
      : ((c) == 0x7F                                                    \
         ? (NILP (current_buffer->ctl_arrow) ? 4 : 2)                   \
         : ((! NILP (current_buffer->enable_multibyte_characters)       \
             && BASE_LEADING_CODE_P (c))                                \
            ? WIDTH_BY_CHAR_HEAD (c)                                    \
            : 4))))
1193
1194 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1195   "Return width of CHAR when displayed in the current buffer.\n\
1196 The width is measured by how many columns it occupies on the screen.")
1197   (ch)
1198        Lisp_Object ch;
1199 {
1200   Lisp_Object val, disp;
1201   int c;
1202   struct Lisp_Char_Table *dp = buffer_display_table ();
1203
1204   CHECK_NUMBER (ch, 0);
1205
1206   c = XINT (ch);
1207
1208   /* Get the way the display table would display it.  */
1209   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1210
1211   if (VECTORP (disp))
1212     XSETINT (val, XVECTOR (disp)->size);
1213   else if (SINGLE_BYTE_CHAR_P (c))
1214     XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1215   else
1216     {
1217       int charset = CHAR_CHARSET (c);
1218
1219       XSETFASTINT (val, CHARSET_WIDTH (charset));
1220     }
1221   return val;
1222 }
1223
1224 /* Return width of string STR of length LEN when displayed in the
1225    current buffer.  The width is measured by how many columns it
1226    occupies on the screen.  */
1227
1228 int
1229 strwidth (str, len)
1230      unsigned char *str;
1231      int len;
1232 {
1233   unsigned char *endp = str + len;
1234   int width = 0;
1235   struct Lisp_Char_Table *dp = buffer_display_table ();
1236
1237   while (str < endp)
1238     {
1239       Lisp_Object disp;
1240       int thislen;
1241       int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1242
1243       /* Get the way the display table would display it.  */
1244       if (dp)
1245         disp = DISP_CHAR_VECTOR (dp, c);
1246       else
1247         disp = Qnil;
1248
1249       if (VECTORP (disp))
1250         width += XVECTOR (disp)->size;
1251       else
1252         width += ONE_BYTE_CHAR_WIDTH (*str);
1253
1254       str += thislen;
1255     }
1256   return width;
1257 }
1258
1259 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1260   "Return width of STRING when displayed in the current buffer.\n\
1261 Width is measured by how many columns it occupies on the screen.\n\
1262 When calculating width of a multibyte character in STRING,\n\
1263 only the base leading-code is considered; the validity of\n\
1264 the following bytes is not checked.")
1265   (str)
1266      Lisp_Object str;
1267 {
1268   Lisp_Object val;
1269
1270   CHECK_STRING (str, 0);
1271   XSETFASTINT (val, strwidth (XSTRING (str)->data,
1272                               STRING_BYTES (XSTRING (str))));
1273   return val;
1274 }
1275
1276 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1277   "Return the direction of CHAR.\n\
1278 The returned value is 0 for left-to-right and 1 for right-to-left.")
1279   (ch)
1280      Lisp_Object ch;
1281 {
1282   int charset;
1283
1284   CHECK_NUMBER (ch, 0);
1285   charset = CHAR_CHARSET (XFASTINT (ch));
1286   if (!CHARSET_DEFINED_P (charset))
1287     invalid_character (XINT (ch));
1288   return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1289 }
1290
1291 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1292   "Return number of characters between BEG and END.")
1293   (beg, end)
1294      Lisp_Object beg, end;
1295 {
1296   int from, to;
1297
1298   CHECK_NUMBER_COERCE_MARKER (beg, 0);
1299   CHECK_NUMBER_COERCE_MARKER (end, 1);
1300
1301   from = min (XFASTINT (beg), XFASTINT (end));
1302   to = max (XFASTINT (beg), XFASTINT (end));
1303
1304   return make_number (to - from);
1305 }
1306
1307 /* Return the number of characters in the NBYTES bytes at PTR.
1308    This works by looking at the contents and checking for multibyte sequences.
1309    However, if the current buffer has enable-multibyte-characters = nil,
1310    we treat each byte as a character.  */
1311
1312 int
1313 chars_in_text (ptr, nbytes)
1314      unsigned char *ptr;
1315      int nbytes;
1316 {
1317   /* current_buffer is null at early stages of Emacs initialization.  */
1318   if (current_buffer == 0
1319       || NILP (current_buffer->enable_multibyte_characters))
1320     return nbytes;
1321
1322   return multibyte_chars_in_text (ptr, nbytes);
1323 }
1324
/* Return the number of characters in the NBYTES bytes at PTR,
   counting each multibyte sequence found by PARSE_MULTIBYTE_SEQ as
   one character.  Unlike chars_in_text, this ignores
   enable-multibyte-characters.  */

int
multibyte_chars_in_text (ptr, nbytes)
     unsigned char *ptr;
     int nbytes;
{
  unsigned char *limit = ptr + nbytes;
  int count, seq_bytes;

  for (count = 0; ptr < limit; count++)
    {
      /* SEQ_BYTES receives the length of the sequence at PTR.  */
      PARSE_MULTIBYTE_SEQ (ptr, limit - ptr, seq_bytes);
      ptr += seq_bytes;
    }

  return count;
}
1349
/* Parse the unibyte text at STR of LEN bytes as if it were multibyte
   text, and store in *NCHARS and *NBYTES the numbers of characters
   and bytes the text would have as multibyte text.  A byte that does
   not start a multibyte sequence (see UNIBYTE_STR_AS_MULTIBYTE_P) is
   counted as one character of 2 bytes, because 8-bit characters in
   the range 0x80..0x9F are represented by 2 bytes in multibyte
   text.  */
void
parse_str_as_multibyte (str, len, nchars, nbytes)
     unsigned char *str;
     int len, *nchars, *nbytes;
{
  unsigned char *limit = str + len;
  int seq_bytes;
  int char_count = 0, byte_count = 0;

  for (; str < limit; char_count++)
    {
      if (UNIBYTE_STR_AS_MULTIBYTE_P (str, limit - str, seq_bytes))
        {
          /* A sequence that can stay as it is.  */
          str += seq_bytes;
          byte_count += seq_bytes;
        }
      else
        {
          /* A single byte that needs a 2-byte form.  */
          str++;
          byte_count += 2;
        }
    }

  *nchars = char_count;
  *nbytes = byte_count;
}
1374
1375 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
1376    It actually converts only 8-bit characters in the range 0x80..0x9F
1377    that don't contruct multibyte characters to multibyte forms.  If
1378    NCHARS is nonzero, set *NCHARS to the number of characters in the
1379    text.  It is assured that we can use LEN bytes at STR as a work
1380    area and that is enough.  Return the number of bytes of the
1381    resulting text.  */
1382
1383 int
1384 str_as_multibyte (str, len, nbytes, nchars)
1385      unsigned char *str;
1386      int len, nbytes, *nchars;
1387 {
1388   unsigned char *p = str, *endp = str + nbytes;
1389   unsigned char *to;
1390   int chars = 0;
1391   int n;
1392
1393   while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1394     p += n, chars++;
1395   if (nchars)
1396     *nchars = chars;
1397   if (p == endp)
1398     return nbytes;
1399
1400   to = p;
1401   nbytes = endp - p;
1402   endp = str + len;
1403   safe_bcopy (p, endp - nbytes, nbytes);
1404   p = endp - nbytes;
1405   while (p < endp)
1406     {
1407       if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1408         {
1409           while (n--)
1410             *to++ = *p++;
1411         }
1412       else
1413         {
1414           *to++ = LEADING_CODE_8_BIT_CONTROL;
1415           *to++ = *p++ + 0x20;
1416         }
1417       chars++;
1418     }
1419   if (nchars)
1420     *nchars = chars;
1421   return (to - str);
1422 }
1423
1424 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
1425    that contains the same single-byte characters.  It actually
1426    converts all 8-bit characters to multibyte forms.  It is assured
1427    that we can use LEN bytes at STR as a work area and that is
1428    enough.  */
1429
1430 int
1431 str_to_multibyte (str, len, bytes)
1432      unsigned char *str;
1433      int len, bytes;
1434 {
1435   unsigned char *p = str, *endp = str + bytes;
1436   unsigned char *to;
1437   int c;
1438
1439   while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1440   if (p == endp)
1441     return bytes;
1442   to = p;
1443   bytes = endp - p;
1444   endp = str + len;
1445   safe_bcopy (p, endp - bytes, bytes);
1446   p = endp - bytes;
1447   while (p < endp)
1448     {
1449       if (*p < 0x80 || *p >= 0xA0)
1450         *to++ = *p++;
1451       else
1452         *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1453     }
1454   return (to - str);
1455 }
1456
1457 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
1458    actually converts only 8-bit characters in the range 0x80..0x9F to
1459    unibyte forms.  */
1460
1461 int
1462 str_as_unibyte (str, bytes)
1463      unsigned char *str;
1464      int bytes;
1465 {
1466   unsigned char *p = str, *endp = str + bytes;
1467   unsigned char *to = str;
1468
1469   while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1470   to = p;
1471   while (p < endp)
1472     {
1473       if (*p == LEADING_CODE_8_BIT_CONTROL)
1474         *to++ = *(p + 1) - 0x20, p += 2;
1475       else
1476         *to++ = *p++;
1477     }
1478   return (to - str);
1479 }
1480
1481 \f
1482 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1483   "Concatenate all the argument characters and make the result a string.")
1484   (n, args)
1485      int n;
1486      Lisp_Object *args;
1487 {
1488   int i;
1489   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1490   unsigned char *p = buf;
1491   int c;
1492
1493   for (i = 0; i < n; i++)
1494     {
1495       CHECK_NUMBER (args[i], 0);
1496       c = XINT (args[i]);
1497       p += CHAR_STRING (c, p);
1498     }
1499
1500   return make_string_from_bytes (buf, n, p - buf);
1501 }
1502
1503 #endif /* emacs */
1504 \f
1505 int
1506 charset_id_internal (charset_name)
1507      char *charset_name;
1508 {
1509   Lisp_Object val;
1510
1511   val= Fget (intern (charset_name), Qcharset);
1512   if (!VECTORP (val))
1513     error ("Charset %s is not defined", charset_name);
1514
1515   return (XINT (XVECTOR (val)->contents[0]));
1516 }
1517
1518 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1519        Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1520    ()
1521 {
1522   charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1523   charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1524   charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1525   charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1526   charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1527   charset_big5_1 = charset_id_internal ("chinese-big5-1");
1528   charset_big5_2 = charset_id_internal ("chinese-big5-2");
1529   return Qnil;
1530 }
1531
1532 void
1533 init_charset_once ()
1534 {
1535   int i, j, k;
1536
1537   staticpro (&Vcharset_table);
1538   staticpro (&Vcharset_symbol_table);
1539   staticpro (&Vgeneric_character_list);
1540
1541   /* This has to be done here, before we call Fmake_char_table.  */
1542   Qcharset_table = intern ("charset-table");
1543   staticpro (&Qcharset_table);
1544
1545   /* Intern this now in case it isn't already done.
1546      Setting this variable twice is harmless.
1547      But don't staticpro it here--that is done in alloc.c.  */
1548   Qchar_table_extra_slots = intern ("char-table-extra-slots");
1549
1550   /* Now we are ready to set up this property, so we can
1551      create the charset table.  */
1552   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1553   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1554
1555   Qunknown = intern ("unknown");
1556   staticpro (&Qunknown);
1557   Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1558                                         Qunknown);
1559
1560   /* Setup tables.  */
1561   for (i = 0; i < 2; i++)
1562     for (j = 0; j < 2; j++)
1563       for (k = 0; k < 128; k++)
1564         iso_charset_table [i][j][k] = -1;
1565
1566   for (i = 0; i < 256; i++)
1567     bytes_by_char_head[i] = 1;
1568   for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1569        i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1570     bytes_by_char_head[i] = 2;
1571   for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1572        i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1573     bytes_by_char_head[i] = 3;
1574   bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1575   bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1576   bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1577   bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1578   bytes_by_char_head[LEADING_CODE_8_BIT_CONTROL] = 2;
1579
1580   for (i = 0; i < 128; i++)
1581     width_by_char_head[i] = 1;
1582   for (; i < 256; i++)
1583     width_by_char_head[i] = 4;
1584   width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1585   width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1586   width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1587   width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1588
1589   {
1590     Lisp_Object val;
1591
1592     val = Qnil;
1593     for (i = 0x81; i < 0x90; i++)
1594       val = Fcons (make_number ((i - 0x70) << 7), val);
1595     for (; i < 0x9A; i++)
1596       val = Fcons (make_number ((i - 0x8F) << 14), val);
1597     for (i = 0xA0; i < 0xF0; i++)
1598       val = Fcons (make_number ((i - 0x70) << 7), val);
1599     for (; i < 0xFF; i++)
1600       val = Fcons (make_number ((i - 0xE0) << 14), val);
1601     Vgeneric_character_list = Fnreverse (val);
1602   }
1603
1604   nonascii_insert_offset = 0;
1605   Vnonascii_translation_table = Qnil;
1606 }
1607
1608 #ifdef emacs
1609
1610 void
1611 syms_of_charset ()
1612 {
1613   Qcharset = intern ("charset");
1614   staticpro (&Qcharset);
1615
1616   Qascii = intern ("ascii");
1617   staticpro (&Qascii);
1618
1619   Qeight_bit_control = intern ("eight-bit-control");
1620   staticpro (&Qeight_bit_control);
1621
1622   Qeight_bit_graphic = intern ("eight-bit-graphic");
1623   staticpro (&Qeight_bit_graphic);
1624
1625   /* Define special charsets ascii, eight-bit-control, and
1626      eight-bit-graphic.  */
1627   update_charset_table (make_number (CHARSET_ASCII),
1628                         make_number (1), make_number (94),
1629                         make_number (1),
1630                         make_number (0),
1631                         make_number ('B'),
1632                         make_number (0),
1633                         build_string ("ASCII"),
1634                         build_string ("ASCII"),
1635                         build_string ("ASCII (ISO646 IRV)"));
1636   CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1637   Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1638
1639   update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1640                         make_number (1), make_number (96),
1641                         make_number (4),
1642                         make_number (0),
1643                         make_number (-1),
1644                         make_number (-1),
1645                         build_string ("8-bit control code (0x80..0x9F)"),
1646                         build_string ("8-bit control code (0x80..0x9F)"),
1647                         build_string ("8-bit control code (0x80..0x9F)"));
1648   CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1649   Fput (Qeight_bit_control, Qcharset,
1650         CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1651
1652   update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1653                         make_number (1), make_number (96),
1654                         make_number (4),
1655                         make_number (0),
1656                         make_number (-1),
1657                         make_number (-1),
1658                         build_string ("8-bit graphic char"),
1659                         build_string ("8-bit graphic char (0xA0..0xFF)"),
1660                         build_string ("8-bit graphic char (0xA0..0xFF)"));
1661   CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1662   Fput (Qeight_bit_graphic, Qcharset,
1663         CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1664
1665   Qauto_fill_chars = intern ("auto-fill-chars");
1666   staticpro (&Qauto_fill_chars);
1667   Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1668
1669   defsubr (&Sdefine_charset);
1670   defsubr (&Sgeneric_character_list);
1671   defsubr (&Sget_unused_iso_final_char);
1672   defsubr (&Sdeclare_equiv_charset);
1673   defsubr (&Sfind_charset_region);
1674   defsubr (&Sfind_charset_string);
1675   defsubr (&Smake_char_internal);
1676   defsubr (&Ssplit_char);
1677   defsubr (&Schar_charset);
1678   defsubr (&Scharset_after);
1679   defsubr (&Siso_charset);
1680   defsubr (&Schar_valid_p);
1681   defsubr (&Sunibyte_char_to_multibyte);
1682   defsubr (&Smultibyte_char_to_unibyte);
1683   defsubr (&Schar_bytes);
1684   defsubr (&Schar_width);
1685   defsubr (&Sstring_width);
1686   defsubr (&Schar_direction);
1687   defsubr (&Schars_in_region);
1688   defsubr (&Sstring);
1689   defsubr (&Ssetup_special_charsets);
1690
1691   DEFVAR_LISP ("charset-list", &Vcharset_list,
1692     "List of charsets ever defined.");
1693   Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1694                                         Fcons (Qeight_bit_graphic, Qnil)));
1695
1696   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1697     "Vector of cons cell of a symbol and translation table ever defined.\n\
1698 An ID of a translation table is an index of this vector.");
1699   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1700
1701   DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1702     "Leading-code of private TYPE9N charset of column-width 1.");
1703   leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1704
1705   DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1706     "Leading-code of private TYPE9N charset of column-width 2.");
1707   leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1708
1709   DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1710     "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1711   leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1712
1713   DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1714     "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1715   leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1716
1717   DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1718     "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1719 This is used for converting unibyte text to multibyte,\n\
1720 and for inserting character codes specified by number.\n\n\
1721 This serves to convert a Latin-1 or similar 8-bit character code\n\
1722 to the corresponding Emacs multibyte character code.\n\
1723 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1724 for your choice of character set.\n\
1725 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1726   nonascii_insert_offset = 0;
1727
1728   DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1729     "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1730 This is used for converting unibyte text to multibyte,\n\
1731 and for inserting character codes specified by number.\n\n\
1732 Conversion is performed only when multibyte characters are enabled,\n\
1733 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1734 to the corresponding Emacs character code.\n\n\
1735 If this is nil, `nonascii-insert-offset' is used instead.\n\
1736 See also the docstring of `make-translation-table'.");
1737   Vnonascii_translation_table = Qnil;
1738
1739   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1740     "A char-table for characters which invoke auto-filling.\n\
1741 Such characters have value t in this table.");
1742   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1743   CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1744   CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1745 }
1746
1747 #endif /* emacs */