code.delx.au - gnu-emacs/blob - src/charset.c

   1 /* Basic multilingual character support.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* At first, see the document in `charset.h' to understand the code in
  23    this file.  */
  24
  25 #ifdef emacs
  26 #include <config.h>
  27 #endif
  28
  29 #include <stdio.h>
  30
  31 #ifdef emacs
  32
  33 #include <sys/types.h>
  34 #include "lisp.h"
  35 #include "buffer.h"
  36 #include "charset.h"
  37 #include "coding.h"
  38 #include "disptab.h"
  39
  40 #else  /* not emacs */
  41
  42 #include "mulelib.h"
  43
  44 #endif /* emacs */
  45
  46 Lisp_Object Qcharset, Qascii, Qcomposition;
  47 Lisp_Object Qunknown;
  48
  49 /* Declaration of special leading-codes.  */
  50 int leading_code_composition;   /* for composite characters */
  51 int leading_code_private_11;    /* for private DIMENSION1 of 1-column */
  52 int leading_code_private_12;    /* for private DIMENSION1 of 2-column */
  53 int leading_code_private_21;    /* for private DIMENSION2 of 1-column */
  54 int leading_code_private_22;    /* for private DIMENSION2 of 2-column */
  55
  56 /* Declaration of special charsets.  */
  57 int charset_ascii;              /* ASCII */
  58 int charset_composition;        /* for a composite character */
  59 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
  60 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
  61 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
  62 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
  63 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
  64 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
  65 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
  66
  67 int min_composite_char;
  68
  69 Lisp_Object Qcharset_table;
  70
  71 /* A char-table containing information of each character set.  */
  72 Lisp_Object Vcharset_table;
  73
  74 /* A vector of charset symbol indexed by charset-id.  This is used
  75    only for returning charset symbol from C functions.  */
  76 Lisp_Object Vcharset_symbol_table;
  77
  78 /* A list of charset symbols ever defined.  */
  79 Lisp_Object Vcharset_list;
  80
  81 /* Vector of translation table ever defined.
  82    ID of a translation table is used to index this vector.  */
  83 Lisp_Object Vtranslation_table_vector;
  84
  85 /* A char-table for characters which may invoke auto-filling.  */
  86 Lisp_Object Vauto_fill_chars;
  87
  88 Lisp_Object Qauto_fill_chars;
  89
  90 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
  91 int bytes_by_char_head[256];
  92 int width_by_char_head[256];
  93
  94 /* Mapping table from ISO2022's charset (specified by DIMENSION,
  95    CHARS, and FINAL-CHAR) to Emacs' charset.  */
  96 int iso_charset_table[2][2][128];
  97
  98 /* Table of pointers to the structure `cmpchar_info' indexed by
  99    CMPCHAR-ID.  */
 100 struct cmpchar_info **cmpchar_table;
 101 /* The current size of `cmpchar_table'.  */
 102 static int cmpchar_table_size;
 103 /* Number of the current composite characters.  */
 104 int n_cmpchars;
 105
 106 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
 107 unsigned char *_fetch_multibyte_char_p;
 108 int _fetch_multibyte_char_len;
 109
 110 /* Offset to add to a non-ASCII value when inserting it.  */
 111 int nonascii_insert_offset;
 112
 113 /* Translation table for converting non-ASCII unibyte characters
 114    to multibyte codes, or nil.  */
 115 Lisp_Object Vnonascii_translation_table;
 116
 117 /* List of all possible generic characters.  */
 118 Lisp_Object Vgeneric_character_list;
 119
 120 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
 121 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
 122 \f
 123 void
 124 invalid_character (c)
 125      int c;
 126 {
 127   error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
 128 }
 129
 130 /* Parse composite character string STR of length LENGTH (>= 2) and
 131    set BYTES, CHARSET, C1, and C2 as below.
 132
 133    It is assumed that *STR is LEADING_CODE_COMPOSITION and the
 134    following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
 135
 136    If there is a valid composite character, set CHARSET, C1, and C2 to
 137    such values that MAKE_CHAR can make the composite character from
 138    them.  Otherwise, set CHARSET to CHARSET_COMPOSITION, set C1 to the
 139    second byte of the sequence, C2 to -1 so that MAKE_CHAR can make
 140    the invalid multibyte character whose string representation is two
 141    bytes of STR[0] and STR[1].  In any case, set BYTES to LENGTH.
 142
 143    This macro should be called only from SPLIT_MULTIBYTE_SEQ.  */
 144
 145 #define SPLIT_COMPOSITE_SEQ(str, length, bytes, charset, c1, c2)        \
 146   do {                                                                  \
 147     int cmpchar_id = str_cmpchar_id ((str), (length));                  \
 148                                                                         \
 149     (charset) = CHARSET_COMPOSITION;                                    \
 150     (bytes) = (length);                                                 \
 151     if (cmpchar_id >= 0)                                                \
 152       {                                                                 \
 153         (c1) = CHAR_FIELD2 (cmpchar_id);                                \
 154         (c2) = CHAR_FIELD3 (cmpchar_id);                                \
 155       }                                                                 \
 156     else                                                                \
 157       {                                                                 \
 158         (c1) = (str)[1] & 0x7F;                                         \
 159         (c2) = -1;                                                      \
 160       }                                                                 \
 161   } while (0)
 162
 163 /* Parse non-composite multibyte character string STR of length LENGTH
 164    (>= 2) and set BYTES to the length of actual multibyte sequence,
 165    CHARSET, C1, and C2 to such values that MAKE_CHAR can make the
 166    multibyte character from them.
 167
 168    It is assumed that *STR is one of base leading codes (excluding
 169    LEADING_CODE_COMPOSITION) and the following (LENGTH - 1) bytes
 170    satisfy !CHAR_HEAD_P.
 171
 172    This macro should be called only from SPLIT_MULTIBYTE_SEQ.  */
 173
 174 #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2)        \
 175   do {                                                                  \
 176     (bytes) = 1;                                                        \
 177     (charset) = (str)[0];                                               \
 178     if ((charset) >= LEADING_CODE_PRIVATE_11                            \
 179         && (charset) <= LEADING_CODE_PRIVATE_22)                        \
 180       (charset) = (str)[(bytes)++];                                     \
 181     if ((bytes) < (length))                                             \
 182       {                                                                 \
 183         (c1) = (str)[(bytes)++] & 0x7F;                                 \
 184         if ((bytes) < (length))                                         \
 185           (c2) = (str)[(bytes)++] & 0x7F;                               \
 186         else                                                            \
 187           (c2) = -1;                                                    \
 188       }                                                                 \
 189     else                                                                \
 190       (c1) = (c2) = -1;                                                 \
 191   } while (0)
 192
 193 /* Parse string STR of length LENGTH and check if a multibyte
 194    characters is at STR.  set BYTES to the actual length, CHARSET, C1,
 195    C2 to proper values for that character.  */
 196
 197 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)        \
 198   do {                                                                  \
 199     int i;                                                              \
 200     if (ASCII_BYTE_P ((str)[0]))                                        \
 201       i = 1;                                                            \
 202     else                                                                \
 203       for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++);       \
 204     if (i == 1)                                                         \
 205       (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ;         \
 206     else if ((str)[0] == LEADING_CODE_COMPOSITION)                      \
 207       SPLIT_COMPOSITE_SEQ (str, i, bytes, charset, c1, c2);             \
 208     else                                                                \
 209       {                                                                 \
 210         if (i > BYTES_BY_CHAR_HEAD ((str)[0]))                          \
 211           i = BYTES_BY_CHAR_HEAD ((str)[0]);                            \
 212         SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2);           \
 213       }                                                                 \
 214   } while (0)
 215
 216 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  */
 217 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
 218   (charset == CHARSET_ASCII                             \
 219    ? ((c1) >= 0 && (c1) <= 0x7F)                        \
 220    : (CHARSET_DIMENSION (charset) == 1                  \
 221       ? ((c1) >= 0x20 && (c1) <= 0x7F)                  \
 222       : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)))
 223
 224 /* Set STR a pointer to the multi-byte form of the character C.  If C
 225    is not a composite character, the multi-byte form is set in WORKBUF
 226    and STR points WORKBUF.  The caller should allocate at least 4-byte
 227    area at WORKBUF in advance.  Returns the length of the multi-byte
 228    form.  If C is an invalid character, store (C & 0xFF) in WORKBUF[0]
 229    and return 1.
 230
 231    Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
 232    function directly if C can be an ASCII character.  */
 233
 234 int
 235 non_ascii_char_to_string (c, workbuf, str)
 236      int c;
 237      unsigned char *workbuf, **str;
 238 {
 239   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
 240     {
 241       /* Multibyte character can't have a modifier bit.  */
 242       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 243         invalid_character (c);
 244
 245       /* For Meta, Shift, and Control modifiers, we need special care.  */
 246       if (c & CHAR_META)
 247         {
 248           /* Move the meta bit to the right place for a string.  */
 249           c = (c & ~CHAR_META) | 0x80;
 250         }
 251       if (c & CHAR_SHIFT)
 252         {
 253           /* Shift modifier is valid only with [A-Za-z].  */
 254           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 255             c &= ~CHAR_SHIFT;
 256           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 257             c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 258         }
 259       if (c & CHAR_CTL)
 260         {
 261           /* Simulate the code in lread.c.  */
 262           /* Allow `\C- ' and `\C-?'.  */
 263           if (c == (CHAR_CTL | ' '))
 264             c = 0;
 265           else if (c == (CHAR_CTL | '?'))
 266             c = 127;
 267           /* ASCII control chars are made from letters (both cases),
 268              as well as the non-letters within 0100...0137.  */
 269           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 270             c &= (037 | (~0177 & ~CHAR_CTL));
 271           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 272             c &= (037 | (~0177 & ~CHAR_CTL));
 273         }
 274
 275       /* If C still has any modifier bits, it is an invalid character.  */
 276       if (c & CHAR_MODIFIER_MASK)
 277         invalid_character (c);
 278
 279       *str = workbuf;
 280       *workbuf++ = c;
 281     }
 282   else
 283     {
 284       int charset, c1, c2;
 285
 286       SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
 287       if (charset == CHARSET_COMPOSITION)
 288         {
 289           if (c >= MAX_CHAR)
 290             invalid_character (c);
 291           if (c >= MIN_CHAR_COMPOSITION)
 292             {
 293               /* Valid composite character.  */
 294               *str = cmpchar_table[COMPOSITE_CHAR_ID (c)]->data;
 295               workbuf = *str + cmpchar_table[COMPOSITE_CHAR_ID (c)]->len;
 296             }
 297           else
 298             {
 299               /* Invalid but can have multibyte form.  */
 300               *str = workbuf;
 301               *workbuf++ = LEADING_CODE_COMPOSITION;
 302               *workbuf++ = c1 | 0x80;
 303             }
 304         }
 305       else if (charset > CHARSET_COMPOSITION)
 306         {
 307           *str = workbuf;
 308           if (charset >= LEADING_CODE_EXT_11)
 309             *workbuf++ = (charset < LEADING_CODE_EXT_12
 310                           ? LEADING_CODE_PRIVATE_11
 311                           : (charset < LEADING_CODE_EXT_21
 312                              ? LEADING_CODE_PRIVATE_12
 313                              : (charset < LEADING_CODE_EXT_22
 314                                 ? LEADING_CODE_PRIVATE_21
 315                                 : LEADING_CODE_PRIVATE_22)));
 316           *workbuf++ = charset;
 317           if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
 318             invalid_character (c);
 319           if (c1)
 320             {
 321               *workbuf++ = c1 | 0x80;
 322               if (c2 > 0)
 323                 *workbuf++ = c2 | 0x80;
 324             }
 325         }
 326       else if (charset == CHARSET_ASCII)
 327         *workbuf++= c & 0x7F;
 328       else
 329         invalid_character (c);
 330     }
 331
 332   return (workbuf - *str);
 333 }
 334
 335 /* Return the non-ASCII character corresponding to multi-byte form at
 336    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
 337    length of the multibyte form in *ACTUAL_LEN.
 338
 339    Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
 340    directly if you want ot handle ASCII characters as well.  */
 341
 342 int
 343 string_to_non_ascii_char (str, len, actual_len)
 344      const unsigned char *str;
 345      int len, *actual_len;
 346 {
 347   int c, bytes, charset, c1, c2;
 348
 349   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 350   c = MAKE_CHAR (charset, c1, c2);
 351   if (actual_len)
 352     *actual_len = bytes;
 353   return c;
 354 }
 355
 356 /* Return the length of the multi-byte form at string STR of length LEN.
 357    Use the macro MULTIBYTE_FORM_LENGTH instead.  */
 358 int
 359 multibyte_form_length (str, len)
 360      const unsigned char *str;
 361      int len;
 362 {
 363   int bytes;
 364
 365   PARSE_MULTIBYTE_SEQ (str, len, bytes);
 366   return bytes;
 367 }
 368
 369 /* Check multibyte form at string STR of length LEN and set variables
 370    pointed by CHARSET, C1, and C2 to charset and position codes of the
 371    character at STR, and return 0.  If there's no multibyte character,
 372    return -1.  This should be used only in the macro SPLIT_STRING
 373    which checks range of STR in advance.  */
 374
 375 int
 376 split_non_ascii_string (str, len, charset, c1, c2)
 377      const unsigned char *str;
 378      unsigned char *c1, *c2;
 379      int len, *charset;
 380 {
 381   register int bytes, cs, code1, code2 = -1;
 382
 383   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
 384   if (cs == CHARSET_ASCII)
 385     return -1;
 386   *charset = cs;
 387   *c1 = code1;
 388   *c2 = code2;
 389   return 0;
 390 }
 391
 392 /* Return 1 iff character C has valid printable glyph.
 393    Use the macro CHAR_PRINTABLE_P instead.  */
 394 int
 395 char_printable_p (c)
 396      int c;
 397 {
 398   int charset, c1, c2, chars;
 399
 400   if (SINGLE_BYTE_CHAR_P (c))
 401     return 1;
 402   if (c >= MIN_CHAR_COMPOSITION)
 403     return (c < MAX_CHAR);
 404
 405   SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
 406   if (! CHARSET_DEFINED_P (charset))
 407     return 0;
 408   if (CHARSET_CHARS (charset) == 94
 409       ? c1 <= 32 || c1 >= 127
 410       : c1 < 32)
 411     return 0;
 412   if (CHARSET_DIMENSION (charset) == 2
 413       && (CHARSET_CHARS (charset) == 94
 414           ? c2 <= 32 || c2 >= 127
 415           : c2 < 32))
 416     return 0;
 417   return 1;
 418 }
 419
 420 /* Translate character C by translation table TABLE.  If C
 421    is negative, translate a character specified by CHARSET, C1, and C2
 422    (C1 and C2 are code points of the character).  If no translation is
 423    found in TABLE, return C.  */
 424 int
 425 translate_char (table, c, charset, c1, c2)
 426      Lisp_Object table;
 427      int c, charset, c1, c2;
 428 {
 429   Lisp_Object ch;
 430   int alt_charset, alt_c1, alt_c2, dimension;
 431
 432   if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
 433   if (!CHAR_TABLE_P (table)
 434       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
 435     return c;
 436
 437   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
 438   dimension = CHARSET_DIMENSION (alt_charset);
 439   if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
 440     /* CH is not a generic character, just return it.  */
 441     return XFASTINT (ch);
 442
 443   /* Since CH is a generic character, we must return a specific
 444      charater which has the same position codes as C from CH.  */
 445   if (charset < 0)
 446     SPLIT_CHAR (c, charset, c1, c2);
 447   if (dimension != CHARSET_DIMENSION (charset))
 448     /* We can't make such a character because of dimension mismatch.  */
 449     return c;
 450   return MAKE_CHAR (alt_charset, c1, c2);
 451 }
 452
 453 /* Convert the unibyte character C to multibyte based on
 454    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
 455    convert C to a valid multibyte character, convert it based on
 456    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
 457
 458 int
 459 unibyte_char_to_multibyte (c)
 460      int c;
 461 {
 462   if (c < 0400 && c >= 0200)
 463     {
 464       int c_save = c;
 465
 466       if (! NILP (Vnonascii_translation_table))
 467         {
 468           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
 469           if (c >= 0400 && ! char_valid_p (c, 0))
 470             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 471         }
 472       else if (c >= 0240 && nonascii_insert_offset > 0)
 473         {
 474           c += nonascii_insert_offset;
 475           if (c < 0400 || ! char_valid_p (c, 0))
 476             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 477         }
 478       else if (c >= 0240)
 479         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 480     }
 481   return c;
 482 }
 483
 484
 485 /* Convert the multibyte character C to unibyte 8-bit character based
 486    on Vnonascii_translation_table or nonascii_insert_offset.  If
 487    REV_TBL is non-nil, it should be a reverse table of
 488    Vnonascii_translation_table, i.e. what given by:
 489      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
 490
 491 int
 492 multibyte_char_to_unibyte (c, rev_tbl)
 493      int c;
 494      Lisp_Object rev_tbl;
 495 {
 496   if (!SINGLE_BYTE_CHAR_P (c))
 497     {
 498       int c_save = c;
 499
 500       if (! CHAR_TABLE_P (rev_tbl)
 501           && CHAR_TABLE_P (Vnonascii_translation_table))
 502         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
 503                                           make_number (0));
 504       if (CHAR_TABLE_P (rev_tbl))
 505         {
 506           Lisp_Object temp;
 507           temp = Faref (rev_tbl, make_number (c));
 508           if (INTEGERP (temp))
 509             c = XINT (temp);
 510           if (c >= 256)
 511             c = (c_save & 0177) + 0200;
 512         }
 513       else
 514         {
 515           if (nonascii_insert_offset > 0)
 516             c -= nonascii_insert_offset;
 517           if (c < 128 || c >= 256)
 518             c = (c_save & 0177) + 0200;
 519         }
 520     }
 521
 522   return c;
 523 }
 524
 525 \f
 526 /* Update the table Vcharset_table with the given arguments (see the
 527    document of `define-charset' for the meaning of each argument).
 528    Several other table contents are also updated.  The caller should
 529    check the validity of CHARSET-ID and the remaining arguments in
 530    advance.  */
 531
 532 void
 533 update_charset_table (charset_id, dimension, chars, width, direction,
 534                       iso_final_char, iso_graphic_plane,
 535                       short_name, long_name, description)
 536      Lisp_Object charset_id, dimension, chars, width, direction;
 537      Lisp_Object iso_final_char, iso_graphic_plane;
 538      Lisp_Object short_name, long_name, description;
 539 {
 540   int charset = XINT (charset_id);
 541   int bytes;
 542   unsigned char leading_code_base, leading_code_ext;
 543
 544   if (NILP (CHARSET_TABLE_ENTRY (charset)))
 545     CHARSET_TABLE_ENTRY (charset)
 546       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
 547
 548   /* Get byte length of multibyte form, base leading-code, and
 549      extended leading-code of the charset.  See the comment under the
 550      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
 551   bytes = XINT (dimension);
 552   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 553     {
 554       /* Official charset, it doesn't have an extended leading-code.  */
 555       if (charset != CHARSET_ASCII)
 556         bytes += 1; /* For a base leading-code.  */
 557       leading_code_base = charset;
 558       leading_code_ext = 0;
 559     }
 560   else
 561     {
 562       /* Private charset.  */
 563       bytes += 2; /* For base and extended leading-codes.  */
 564       leading_code_base
 565         = (charset < LEADING_CODE_EXT_12
 566            ? LEADING_CODE_PRIVATE_11
 567            : (charset < LEADING_CODE_EXT_21
 568               ? LEADING_CODE_PRIVATE_12
 569               : (charset < LEADING_CODE_EXT_22
 570                  ? LEADING_CODE_PRIVATE_21
 571                  : LEADING_CODE_PRIVATE_22)));
 572       leading_code_ext = charset;
 573     }
 574
 575   if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
 576     error ("Invalid dimension for the charset-ID %d", charset);
 577
 578   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
 579   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
 580   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
 581   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
 582   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
 583   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
 584   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
 585     = make_number (leading_code_base);
 586   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
 587     = make_number (leading_code_ext);
 588   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
 589   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
 590     = iso_graphic_plane;
 591   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
 592   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
 593   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
 594   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
 595
 596   {
 597     /* If we have already defined a charset which has the same
 598        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
 599        DIRECTION, we must update the entry REVERSE-CHARSET of both
 600        charsets.  If there's no such charset, the value of the entry
 601        is set to nil.  */
 602     int i;
 603
 604     for (i = 0; i <= MAX_CHARSET; i++)
 605       if (!NILP (CHARSET_TABLE_ENTRY (i)))
 606         {
 607           if (CHARSET_DIMENSION (i) == XINT (dimension)
 608               && CHARSET_CHARS (i) == XINT (chars)
 609               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
 610               && CHARSET_DIRECTION (i) != XINT (direction))
 611             {
 612               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 613                 = make_number (i);
 614               CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
 615               break;
 616             }
 617         }
 618     if (i > MAX_CHARSET)
 619       /* No such a charset.  */
 620       CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 621         = make_number (-1);
 622   }
 623
 624   if (charset != CHARSET_ASCII
 625       && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 626     {
 627       width_by_char_head[leading_code_base] = XINT (width);
 628
 629       /* Update table emacs_code_class.  */
 630       emacs_code_class[charset] = (bytes == 2
 631                                    ? EMACS_leading_code_2
 632                                    : (bytes == 3
 633                                       ? EMACS_leading_code_3
 634                                       : EMACS_leading_code_4));
 635     }
 636
 637   /* Update table iso_charset_table.  */
 638   if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
 639     ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
 640 }
 641
 642 #ifdef emacs
 643
 644 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
 645    is invalid.  */
 646 int
 647 get_charset_id (charset_symbol)
 648      Lisp_Object charset_symbol;
 649 {
 650   Lisp_Object val;
 651   int charset;
 652
 653   return ((SYMBOLP (charset_symbol)
 654            && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
 655            && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
 656                CHARSET_VALID_P (charset)))
 657           ? charset : -1);
 658 }
 659
 660 /* Return an identification number for a new private charset of
 661    DIMENSION and WIDTH.  If there's no more room for the new charset,
 662    return 0.  */
 663 Lisp_Object
 664 get_new_private_charset_id (dimension, width)
 665      int dimension, width;
 666 {
 667   int charset, from, to;
 668
 669   if (dimension == 1)
 670     {
 671       if (width == 1)
 672         from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
 673       else
 674         from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
 675     }
 676   else
 677     {
 678       if (width == 1)
 679         from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
 680       else
 681         from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
 682     }
 683
 684   for (charset = from; charset < to; charset++)
 685     if (!CHARSET_DEFINED_P (charset)) break;
 686
 687   return make_number (charset < to ? charset : 0);
 688 }
 689
 690 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
 691   "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
 692 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
 693  treated as a private charset.\n\
 694 INFO-VECTOR is a vector of the format:\n\
 695    [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
 696     SHORT-NAME LONG-NAME DESCRIPTION]\n\
 697 The meanings of each elements is as follows:\n\
 698 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
 699 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
 700 WIDTH (integer) is the number of columns a character in the charset\n\
 701 occupies on the screen: one of 0, 1, and 2.\n\
 702 \n\
 703 DIRECTION (integer) is the rendering direction of characters in the\n\
 704 charset when rendering.  If 0, render from left to right, else\n\
 705 render from right to left.\n\
 706 \n\
 707 ISO-FINAL-CHAR (character) is the final character of the\n\
 708 corresponding ISO 2022 charset.\n\
 709 \n\
 710 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
 711 while encoding to variants of ISO 2022 coding system, one of the\n\
 712 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
 713 \n\
 714 SHORT-NAME (string) is the short name to refer to the charset.\n\
 715 \n\
 716 LONG-NAME (string) is the long name to refer to the charset.\n\
 717 \n\
 718 DESCRIPTION (string) is the description string of the charset.")
 719   (charset_id, charset_symbol, info_vector)
 720      Lisp_Object charset_id, charset_symbol, info_vector;
 721 {
 722   Lisp_Object *vec;
 723
 724   if (!NILP (charset_id))
 725     CHECK_NUMBER (charset_id, 0);
 726   CHECK_SYMBOL (charset_symbol, 1);
 727   CHECK_VECTOR (info_vector, 2);
 728
 729   if (! NILP (charset_id))
 730     {
 731       if (! CHARSET_VALID_P (XINT (charset_id)))
 732         error ("Invalid CHARSET: %d", XINT (charset_id));
 733       else if (CHARSET_DEFINED_P (XINT (charset_id)))
 734         error ("Already defined charset: %d", XINT (charset_id));
 735     }
 736
 737   vec = XVECTOR (info_vector)->contents;
 738   if (XVECTOR (info_vector)->size != 9
 739       || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
 740       || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
 741       || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
 742       || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
 743       || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
 744       || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
 745       || !STRINGP (vec[6])
 746       || !STRINGP (vec[7])
 747       || !STRINGP (vec[8]))
 748     error ("Invalid info-vector argument for defining charset %s",
 749            XSYMBOL (charset_symbol)->name->data);
 750
 751   if (NILP (charset_id))
 752     {
 753       charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
 754       if (XINT (charset_id) == 0)
 755         error ("There's no room for a new private charset %s",
 756                XSYMBOL (charset_symbol)->name->data);
 757     }
 758
 759   update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
 760                         vec[4], vec[5], vec[6], vec[7], vec[8]);
 761   Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
 762   CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
 763   Vcharset_list = Fcons (charset_symbol, Vcharset_list);
 764   return Qnil;
 765 }
 766
 767 DEFUN ("generic-character-list", Fgeneric_character_list,
 768        Sgeneric_character_list, 0, 0, 0,
 769   "Return a list of all possible generic characters.\n\
 770 It includes a generic character for a charset not yet defined.")
 771   ()
 772 {
 773   return Vgeneric_character_list;
 774 }
 775
 776 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
 777        Sget_unused_iso_final_char, 2, 2, 0,
 778   "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
 779 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
 780 CHARS is the number of characters in a dimension: 94 or 96.\n\
 781 \n\
 782 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
 783 If there's no unused final char for the specified kind of charset,\n\
 784 return nil.")
 785   (dimension, chars)
 786      Lisp_Object dimension, chars;
 787 {
 788   int final_char;
 789
 790   CHECK_NUMBER (dimension, 0);
 791   CHECK_NUMBER (chars, 1);
 792   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 793     error ("Invalid charset dimension %d, it should be 1 or 2",
 794            XINT (dimension));
 795   if (XINT (chars) != 94 && XINT (chars) != 96)
 796     error ("Invalid charset chars %d, it should be 94 or 96",
 797            XINT (chars));
 798   for (final_char = '0'; final_char <= '?'; final_char++)
 799     {
 800       if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
 801         break;
 802     }
 803   return (final_char <= '?' ? make_number (final_char) : Qnil);
 804 }
 805
 806 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
 807        4, 4, 0,
 808   "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
 809 CHARSET should be defined by `defined-charset' in advance.")
 810   (dimension, chars, final_char, charset_symbol)
 811      Lisp_Object dimension, chars, final_char, charset_symbol;
 812 {
 813   int charset;
 814
 815   CHECK_NUMBER (dimension, 0);
 816   CHECK_NUMBER (chars, 1);
 817   CHECK_NUMBER (final_char, 2);
 818   CHECK_SYMBOL (charset_symbol, 3);
 819
 820   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 821     error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
 822   if (XINT (chars) != 94 && XINT (chars) != 96)
 823     error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
 824   if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
 825     error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
 826   if ((charset = get_charset_id (charset_symbol)) < 0)
 827     error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
 828
 829   ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
 830   return Qnil;
 831 }
 832
 833 /* Return number of different charsets in STR of length LEN.  In
 834    addition, for each found charset N, CHARSETS[N] is set 1.  The
 835    caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
 836    It may lookup a translation table TABLE if supplied.
 837
 838    If CMPCHARP is nonzero and some composite character is found,
 839    CHARSETS[128] is also set 1 and the returned number is incremented
 840    by 1.
 841
 842    If MULTIBYTE is zero, do not check multibyte characters, i.e. if
 843    any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
 844    8-bit codes are found CHARSET[1] is set to 1.  */
 845
 846 int
 847 find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
 848      unsigned char *str;
 849      int len, *charsets;
 850      Lisp_Object table;
 851      int cmpcharp;
 852      int multibyte;
 853 {
 854   register int num = 0, c;
 855
 856   if (! multibyte)
 857     {
 858       unsigned char *endp = str + len;
 859       int maskbits = 0;
 860
 861       while (str < endp && maskbits != 3)
 862         maskbits |=  (*str++ < 0x80 ? 1 : 2);
 863       if (maskbits & 1)
 864         {
 865           charsets[0] = 1;
 866           num++;
 867         }
 868       if (maskbits & 2)
 869         {
 870           charsets[1] = 1;
 871           num++;
 872         }
 873       return num;
 874     }
 875
 876   if (! CHAR_TABLE_P (table))
 877     table = Qnil;
 878
 879   while (len > 0)
 880     {
 881       int bytes, charset;
 882       c = *str;
 883
 884       if (c == LEADING_CODE_COMPOSITION)
 885         {
 886           int cmpchar_id = str_cmpchar_id (str, len);
 887           GLYPH *glyph;
 888
 889           if (cmpchar_id >= 0)
 890             {
 891               struct cmpchar_info *cmp_p = cmpchar_table[cmpchar_id];
 892               int i;
 893
 894               for (i = 0; i < cmp_p->glyph_len; i++)
 895                 {
 896                   c = cmp_p->glyph[i];
 897                   if (!NILP (table))
 898                     {
 899                       if ((c = translate_char (table, c, 0, 0, 0)) < 0)
 900                         c = cmp_p->glyph[i];
 901                     }
 902                   if ((charset = CHAR_CHARSET (c)) < 0)
 903                     charset = CHARSET_ASCII;
 904                   if (!charsets[charset])
 905                     {
 906                       charsets[charset] = 1;
 907                       num += 1;
 908                     }
 909                 }
 910               str += cmp_p->len;
 911               len -= cmp_p->len;
 912               if (cmpcharp && !charsets[CHARSET_COMPOSITION])
 913                 {
 914                   charsets[CHARSET_COMPOSITION] = 1;
 915                   num += 1;
 916                 }
 917               continue;
 918             }
 919
 920           charset = 1;          /* This leads to `unknown' charset.  */
 921           bytes = 1;
 922         }
 923       else
 924         {
 925           c = STRING_CHAR_AND_LENGTH (str, len, bytes);
 926           if (! NILP (table))
 927             {
 928               int c1 = translate_char (table, c, 0, 0, 0);
 929               if (c1 >= 0)
 930                 c = c1;
 931             }
 932           charset = CHAR_CHARSET (c);
 933         }
 934
 935       if (!charsets[charset])
 936         {
 937           charsets[charset] = 1;
 938           num += 1;
 939         }
 940       str += bytes;
 941       len -= bytes;
 942     }
 943   return num;
 944 }
 945
 946 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
 947        2, 3, 0,
 948   "Return a list of charsets in the region between BEG and END.\n\
 949 BEG and END are buffer positions.\n\
 950 If the region contains any composite character,\n\
 951 `composition' is included in the returned list.\n\
 952 Optional arg TABLE if non-nil is a translation table to look up.\n\
 953 \n\
 954 If the region contains invalid multiybte characters,\n\
 955 `unknown' is included in the returned list.\n\
 956 \n\
 957 If the current buffer is unibyte, the returned list contains\n\
 958 `ascii' if any 7-bit characters are found,\n\
 959 and `unknown' if any 8-bit characters are found.")
 960   (beg, end, table)
 961      Lisp_Object beg, end, table;
 962 {
 963   int charsets[MAX_CHARSET + 1];
 964   int from, from_byte, to, stop, stop_byte, i;
 965   Lisp_Object val;
 966   int undefined;
 967   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
 968
 969   validate_region (&beg, &end);
 970   from = XFASTINT (beg);
 971   stop = to = XFASTINT (end);
 972
 973   if (from < GPT && GPT < to)
 974     {
 975       stop = GPT;
 976       stop_byte = GPT_BYTE;
 977     }
 978   else
 979     stop_byte = CHAR_TO_BYTE (stop);
 980
 981   from_byte = CHAR_TO_BYTE (from);
 982
 983   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 984   while (1)
 985     {
 986       find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
 987                            charsets, table, 1, multibyte);
 988       if (stop < to)
 989         {
 990           from = stop, from_byte = stop_byte;
 991           stop = to, stop_byte = CHAR_TO_BYTE (stop);
 992         }
 993       else
 994         break;
 995     }
 996
 997   val = Qnil;
 998   undefined = 0;
 999   for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
1000     if (charsets[i])
1001       {
1002         if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
1003           val = Fcons (CHARSET_SYMBOL (i), val);
1004         else
1005           undefined = 1;
1006       }
1007   if (undefined)
1008     val = Fcons (Qunknown, val);
1009   return val;
1010 }
1011
1012 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1013        1, 2, 0,
1014   "Return a list of charsets in STR.\n\
1015 If the string contains any composite characters,\n\
1016 `composition' is included in the returned list.\n\
1017 Optional arg TABLE if non-nil is a translation table to look up.\n\
1018 \n\
1019 If the region contains invalid multiybte characters,\n\
1020 `unknown' is included in the returned list.\n\
1021 \n\
1022 If STR is unibyte, the returned list contains\n\
1023 `ascii' if any 7-bit characters are found,\n\
1024 and `unknown' if any 8-bit characters are found.")
1025   (str, table)
1026      Lisp_Object str, table;
1027 {
1028   int charsets[MAX_CHARSET + 1];
1029   int i;
1030   Lisp_Object val;
1031   int undefined;
1032   int multibyte;
1033
1034   CHECK_STRING (str, 0);
1035   multibyte = STRING_MULTIBYTE (str);
1036
1037   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
1038   find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
1039                        charsets, table, 1, multibyte);
1040   val = Qnil;
1041   undefined = 0;
1042   for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
1043     if (charsets[i])
1044       {
1045         if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
1046           val = Fcons (CHARSET_SYMBOL (i), val);
1047         else
1048           undefined = 1;
1049       }
1050   if (undefined)
1051     val = Fcons (Qunknown, val);
1052   return val;
1053 }
1054 \f
1055 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
1056   "")
1057   (charset, code1, code2)
1058      Lisp_Object charset, code1, code2;
1059 {
1060   int charset_id, c1, c2;
1061
1062   CHECK_NUMBER (charset, 0);
1063   charset_id = XINT (charset);
1064   if (!CHARSET_DEFINED_P (charset_id))
1065     error ("Invalid charset ID: %d", XINT (charset));
1066
1067   if (NILP (code1))
1068     c1 = 0;
1069   else
1070     {
1071       CHECK_NUMBER (code1, 1);
1072       c1 = XINT (code1);
1073     }
1074   if (NILP (code2))
1075     c2 = 0;
1076   else
1077     {
1078       CHECK_NUMBER (code2, 2);
1079       c2 = XINT (code2);
1080     }
1081
1082   if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1083     error ("Invalid code points: %d %d", c1, c2);
1084   c1 &= 0x7F;
1085   c2 &= 0x7F;
1086   if (c1 == 0
1087       ? c2 != 0
1088       : (c2 == 0
1089          ? !CHAR_COMPONENTS_VALID_P (charset, c1, 0x20)
1090          : !CHAR_COMPONENTS_VALID_P (charset, c1, c2)))
1091     error ("Invalid code points: %d %d", c1, c2);
1092
1093   return make_number (MAKE_CHAR (charset_id, c1, c2));
1094 }
1095
1096 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1097   "Return list of charset and one or two position-codes of CHAR.\n\
1098 If CHAR is invalid as a character code,\n\
1099 return a list of symbol `unknown' and CHAR.")
1100   (ch)
1101      Lisp_Object ch;
1102 {
1103   Lisp_Object val;
1104   int c, charset, c1, c2;
1105
1106   CHECK_NUMBER (ch, 0);
1107   c = XFASTINT (ch);
1108   if (!CHAR_VALID_P (c, 1))
1109     return Fcons (Qunknown, Fcons (ch, Qnil));
1110   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1111   return (c2 >= 0
1112           ? Fcons (CHARSET_SYMBOL (charset),
1113                    Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1114           : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1115 }
1116
1117 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1118   "Return charset of CHAR.")
1119   (ch)
1120      Lisp_Object ch;
1121 {
1122   CHECK_NUMBER (ch, 0);
1123
1124   return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1125 }
1126
1127 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1128   "Return charset of a character in the current buffer at position POS.\n\
1129 If POS is nil, it defauls to the current point.\n\
1130 If POS is out of range, the value is nil.")
1131   (pos)
1132      Lisp_Object pos;
1133 {
1134   register int pos_byte, bytes, charset, c1, c2;
1135   register unsigned char *p;
1136
1137   if (NILP (pos))
1138     pos_byte = PT_BYTE;
1139   else if (MARKERP (pos))
1140     {
1141       pos_byte = marker_byte_position (pos);
1142       if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1143         return Qnil;
1144     }
1145   else
1146     {
1147       CHECK_NUMBER (pos, 0);
1148       if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1149         return Qnil;
1150       pos_byte = CHAR_TO_BYTE (XINT (pos));
1151     }
1152   p = BYTE_POS_ADDR (pos_byte);
1153   if (BASE_LEADING_CODE_P (*p))
1154     {
1155       SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1156       if (charset < 0)
1157         charset = 1;
1158     }
1159   else
1160     charset = CHARSET_ASCII;
1161
1162   return CHARSET_SYMBOL (charset);
1163 }
1164
1165 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1166   "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1167 \n\
1168 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1169 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1170 where as Emacs distinguishes them by charset symbol.\n\
1171 See the documentation of the function `charset-info' for the meanings of\n\
1172 DIMENSION, CHARS, and FINAL-CHAR.")
1173   (dimension, chars, final_char)
1174      Lisp_Object dimension, chars, final_char;
1175 {
1176   int charset;
1177
1178   CHECK_NUMBER (dimension, 0);
1179   CHECK_NUMBER (chars, 1);
1180   CHECK_NUMBER (final_char, 2);
1181
1182   if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1183     return Qnil;
1184   return CHARSET_SYMBOL (charset);
1185 }
1186
1187 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1188    generic character.  If GENERICP is zero, return nonzero iff C is a
1189    valid normal character.  Do not call this function directly,
1190    instead use macro CHAR_VALID_P.  */
1191 int
1192 char_valid_p (c, genericp)
1193      int c, genericp;
1194 {
1195   int charset, c1, c2;
1196
1197   if (c < 0)
1198     return 0;
1199   if (SINGLE_BYTE_CHAR_P (c))
1200     return 1;
1201   SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1202   if (charset == CHARSET_COMPOSITION)
1203     return ((c >= MIN_CHAR_COMPOSITION
1204              && c < MIN_CHAR_COMPOSITION + n_cmpchars)
1205             || (genericp && c == GENERIC_COMPOSITION_CHAR));
1206   if (genericp)
1207     {
1208       if (c1)
1209         {
1210           if (c2 <= 0) c2 = 0x20;
1211         }
1212       else
1213         {
1214           if (c2 <= 0) c1 = c2 = 0x20;
1215         }
1216     }
1217   return (CHARSET_DEFINED_P (charset)
1218           && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1219 }
1220
1221 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1222   "Return t if OBJECT is a valid normal character.\n\
1223 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1224 a valid generic character.")
1225   (object, genericp)
1226      Lisp_Object object, genericp;
1227 {
1228   if (! NATNUMP (object))
1229     return Qnil;
1230   return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1231 }
1232
1233 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1234        Sunibyte_char_to_multibyte, 1, 1, 0,
1235   "Convert the unibyte character CH to multibyte character.\n\
1236 The conversion is done based on `nonascii-translation-table' (which see)\n\
1237  or `nonascii-insert-offset' (which see).")
1238   (ch)
1239      Lisp_Object ch;
1240 {
1241   int c;
1242
1243   CHECK_NUMBER (ch, 0);
1244   c = XINT (ch);
1245   if (c < 0 || c >= 0400)
1246     error ("Invalid unibyte character: %d", c);
1247   c = unibyte_char_to_multibyte (c);
1248   if (c < 0)
1249     error ("Can't convert to multibyte character: %d", XINT (ch));
1250   return make_number (c);
1251 }
1252
1253 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1254        Smultibyte_char_to_unibyte, 1, 1, 0,
1255   "Convert the multibyte character CH to unibyte character.\n\
1256 The conversion is done based on `nonascii-translation-table' (which see)\n\
1257  or `nonascii-insert-offset' (which see).")
1258   (ch)
1259      Lisp_Object ch;
1260 {
1261   int c;
1262
1263   CHECK_NUMBER (ch, 0);
1264   c = XINT (ch);
1265   if (! CHAR_VALID_P (c, 0))
1266     error ("Invalid multibyte character: %d", c);
1267   c = multibyte_char_to_unibyte (c, Qnil);
1268   if (c < 0)
1269     error ("Can't convert to unibyte character: %d", XINT (ch));
1270   return make_number (c);
1271 }
1272
1273 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1274   "Return 1 regardless of the argument CHAR.\n\
1275 This is now an obsolete function.  We keep it just for backward compatibility.")
1276   (ch)
1277      Lisp_Object ch;
1278 {
1279   Lisp_Object val;
1280
1281   CHECK_NUMBER (ch, 0);
1282   return make_number (1);
1283 }
1284
1285 /* Return how many bytes C will occupy in a multibyte buffer.
1286    Don't call this function directly, instead use macro CHAR_BYTES.  */
1287 int
1288 char_bytes (c)
1289      int c;
1290 {
1291   int bytes;
1292
1293   if (SINGLE_BYTE_CHAR_P (c) || (c & ~GLYPH_MASK_CHAR))
1294     return 1;
1295
1296   if (COMPOSITE_CHAR_P (c))
1297     {
1298       unsigned int id = COMPOSITE_CHAR_ID (c);
1299
1300       bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
1301     }
1302   else
1303     {
1304       int charset = CHAR_CHARSET (c);
1305
1306       bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
1307     }
1308
1309   return bytes;
1310 }
1311
1312 /* Return the width of character of which multi-byte form starts with
1313    C.  The width is measured by how many columns occupied on the
1314    screen when displayed in the current buffer.  */
1315
1316 #define ONE_BYTE_CHAR_WIDTH(c)                                          \
1317   (c < 0x20                                                             \
1318    ? (c == '\t'                                                         \
1319       ? XFASTINT (current_buffer->tab_width)                            \
1320       : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2)))   \
1321    : (c < 0x7f                                                          \
1322       ? 1                                                               \
1323       : (c == 0x7F                                                      \
1324          ? (NILP (current_buffer->ctl_arrow) ? 4 : 2)                   \
1325          : ((! NILP (current_buffer->enable_multibyte_characters)       \
1326              && BASE_LEADING_CODE_P (c))                                \
1327             ? WIDTH_BY_CHAR_HEAD (c)                                    \
1328             : 4))))
1329
1330 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1331   "Return width of CHAR when displayed in the current buffer.\n\
1332 The width is measured by how many columns it occupies on the screen.")
1333   (ch)
1334        Lisp_Object ch;
1335 {
1336   Lisp_Object val, disp;
1337   int c;
1338   struct Lisp_Char_Table *dp = buffer_display_table ();
1339
1340   CHECK_NUMBER (ch, 0);
1341
1342   c = XINT (ch);
1343
1344   /* Get the way the display table would display it.  */
1345   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1346
1347   if (VECTORP (disp))
1348     XSETINT (val, XVECTOR (disp)->size);
1349   else if (SINGLE_BYTE_CHAR_P (c))
1350     XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1351   else if (COMPOSITE_CHAR_P (c))
1352     {
1353       int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
1354       XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 1));
1355     }
1356   else
1357     {
1358       int charset = CHAR_CHARSET (c);
1359
1360       XSETFASTINT (val, CHARSET_WIDTH (charset));
1361     }
1362   return val;
1363 }
1364
1365 /* Return width of string STR of length LEN when displayed in the
1366    current buffer.  The width is measured by how many columns it
1367    occupies on the screen.  */
1368
1369 int
1370 strwidth (str, len)
1371      unsigned char *str;
1372      int len;
1373 {
1374   unsigned char *endp = str + len;
1375   int width = 0;
1376   struct Lisp_Char_Table *dp = buffer_display_table ();
1377
1378   while (str < endp)
1379     {
1380       if (*str == LEADING_CODE_COMPOSITION)
1381         {
1382           int id = str_cmpchar_id (str, endp - str);
1383
1384           if (id < 0)
1385             {
1386               width += 4;
1387               str++;
1388             }
1389           else
1390             {
1391               width += cmpchar_table[id]->width;
1392               str += cmpchar_table[id]->len;
1393             }
1394         }
1395       else
1396         {
1397           Lisp_Object disp;
1398           int thislen;
1399           int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1400
1401           /* Get the way the display table would display it.  */
1402           if (dp)
1403             disp = DISP_CHAR_VECTOR (dp, c);
1404           else
1405             disp = Qnil;
1406
1407           if (VECTORP (disp))
1408             width += XVECTOR (disp)->size;
1409           else
1410             width += ONE_BYTE_CHAR_WIDTH (*str);
1411
1412           str += thislen;
1413         }
1414     }
1415   return width;
1416 }
1417
1418 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1419   "Return width of STRING when displayed in the current buffer.\n\
1420 Width is measured by how many columns it occupies on the screen.\n\
1421 When calculating width of a multibyte character in STRING,\n\
1422 only the base leading-code is considered; the validity of\n\
1423 the following bytes is not checked.")
1424   (str)
1425      Lisp_Object str;
1426 {
1427   Lisp_Object val;
1428
1429   CHECK_STRING (str, 0);
1430   XSETFASTINT (val, strwidth (XSTRING (str)->data,
1431                               STRING_BYTES (XSTRING (str))));
1432   return val;
1433 }
1434
1435 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1436   "Return the direction of CHAR.\n\
1437 The returned value is 0 for left-to-right and 1 for right-to-left.")
1438   (ch)
1439      Lisp_Object ch;
1440 {
1441   int charset;
1442
1443   CHECK_NUMBER (ch, 0);
1444   charset = CHAR_CHARSET (XFASTINT (ch));
1445   if (!CHARSET_DEFINED_P (charset))
1446     invalid_character (XINT (ch));
1447   return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1448 }
1449
1450 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1451   "Return number of characters between BEG and END.")
1452   (beg, end)
1453      Lisp_Object beg, end;
1454 {
1455   int from, to;
1456
1457   CHECK_NUMBER_COERCE_MARKER (beg, 0);
1458   CHECK_NUMBER_COERCE_MARKER (end, 1);
1459
1460   from = min (XFASTINT (beg), XFASTINT (end));
1461   to = max (XFASTINT (beg), XFASTINT (end));
1462
1463   return make_number (to - from);
1464 }
1465
1466 /* Return the number of characters in the NBYTES bytes at PTR.
1467    This works by looking at the contents and checking for multibyte sequences.
1468    However, if the current buffer has enable-multibyte-characters = nil,
1469    we treat each byte as a character.  */
1470
1471 int
1472 chars_in_text (ptr, nbytes)
1473      unsigned char *ptr;
1474      int nbytes;
1475 {
1476   /* current_buffer is null at early stages of Emacs initialization.  */
1477   if (current_buffer == 0
1478       || NILP (current_buffer->enable_multibyte_characters))
1479     return nbytes;
1480
1481   return multibyte_chars_in_text (ptr, nbytes);
1482 }
1483
1484 /* Return the number of characters in the NBYTES bytes at PTR.
1485    This works by looking at the contents and checking for multibyte sequences.
1486    It ignores enable-multibyte-characters.  */
1487
1488 int
1489 multibyte_chars_in_text (ptr, nbytes)
1490      unsigned char *ptr;
1491      int nbytes;
1492 {
1493   unsigned char *endp;
1494   int chars, bytes;
1495
1496   endp = ptr + nbytes;
1497   chars = 0;
1498
1499   while (ptr < endp)
1500     {
1501       if (BASE_LEADING_CODE_P (*ptr))
1502         {
1503           PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1504           ptr += bytes;
1505           nbytes -= bytes;
1506         }
1507       else
1508         {
1509           ptr++;
1510           nbytes--;
1511         }
1512       chars++;
1513     }
1514
1515   return chars;
1516 }
1517
1518 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1519   "Concatenate all the argument characters and make the result a string.")
1520   (n, args)
1521      int n;
1522      Lisp_Object *args;
1523 {
1524   int i;
1525   unsigned char *buf
1526     = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
1527   unsigned char *p = buf;
1528   Lisp_Object val;
1529
1530   for (i = 0; i < n; i++)
1531     {
1532       int c, len;
1533       unsigned char *str;
1534
1535       if (!INTEGERP (args[i]))
1536         CHECK_NUMBER (args[i], 0);
1537       c = XINT (args[i]);
1538       len = CHAR_STRING (c, p, str);
1539       if (p != str)
1540         /* C is a composite character.  */
1541         bcopy (str, p, len);
1542       p += len;
1543     }
1544
1545   /* Here, we can't use make_string_from_bytes because of byte
1546      combining problem.  */
1547   val = make_string (buf, p - buf);
1548   return val;
1549 }
1550
1551 #endif /* emacs */
1552 \f
1553 /*** Composite characters staffs ***/
1554
1555 /* Each composite character is identified by CMPCHAR-ID which is
1556    assigned when Emacs needs the character code of the composite
1557    character (e.g. when displaying it on the screen).  See the
1558    document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1559    composite character is represented in Emacs.  */
1560
1561 /* If `static' is defined, it means that it is defined to null string. */
1562 #ifndef static
1563 /* The following function is copied from lread.c.  */
1564 static int
1565 hash_string (ptr, len)
1566      unsigned char *ptr;
1567      int len;
1568 {
1569   register unsigned char *p = ptr;
1570   register unsigned char *end = p + len;
1571   register unsigned char c;
1572   register int hash = 0;
1573
1574   while (p != end)
1575     {
1576       c = *p++;
1577       if (c >= 0140) c -= 40;
1578       hash = ((hash<<3) + (hash>>28) + c);
1579     }
1580   return hash & 07777777777;
1581 }
1582 #endif
1583
1584 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1585
1586 static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1587
1588 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1589    integer, where the 1st element is the size of the array, the 2nd
1590    element is how many elements are actually used in the array, and
1591    the remaining elements are CMPCHAR-IDs of composite characters of
1592    the same hash value.  */
1593 #define CMPCHAR_HASH_SIZE(table) table[0]
1594 #define CMPCHAR_HASH_USED(table) table[1]
1595 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1596
1597 /* Return CMPCHAR-ID of the composite character in STR of the length
1598    LEN.  If the composite character has not yet been registered,
1599    register it in `cmpchar_table' and assign new CMPCHAR-ID.  This
1600    is the sole function for assigning CMPCHAR-ID.  */
1601 int
1602 str_cmpchar_id (str, len)
1603      const unsigned char *str;
1604      int len;
1605 {
1606   int hash_idx, *hashp;
1607   unsigned char *buf;
1608   int embedded_rule;            /* 1 if composition rule is embedded.  */
1609   int chars;                    /* number of components.  */
1610   int i;
1611   struct cmpchar_info *cmpcharp;
1612
1613   /* The second byte 0xFF means COMPOSITION rule is embedded.  */
1614   embedded_rule = (str[1] == 0xFF);
1615
1616   /* At first, get the actual length of the composite character.  */
1617   {
1618     const unsigned char *p, *endp = str + 1, *lastp = str + len;
1619     int bytes;
1620
1621     while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
1622     if (endp - str < 5)
1623       /* Any composite char have at least 5-byte length.  */
1624       return -1;
1625
1626     chars = 0;
1627     p = str + 1;
1628     while (p < endp)
1629       {
1630         if (embedded_rule)
1631           {
1632             p++;
1633             if (p >= endp)
1634               return -1;
1635           }
1636         /* No need of checking if *P is 0xA0 because
1637            BYTES_BY_CHAR_HEAD (0x80) surely returns 2.  */
1638         p += BYTES_BY_CHAR_HEAD (*p - 0x20);
1639         chars++;
1640       }
1641     if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1642       /* Invalid components.  */
1643       return -1;
1644     len = p - str;
1645   }
1646   hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1647   hashp = cmpchar_hash_table[hash_idx];
1648
1649   /* Then, look into the hash table.  */
1650   if (hashp != NULL)
1651     /* Find the correct one among composite characters of the same
1652        hash value.  */
1653     for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1654       {
1655         cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1656         if (len == cmpcharp->len
1657             && ! bcmp (str, cmpcharp->data, len))
1658           return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1659       }
1660
1661   /* We have to register the composite character in cmpchar_table.  */
1662   if (n_cmpchars >= (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1663     /* No, we have no more room for a new composite character.  */
1664     return -1;
1665
1666   /* Make the entry in hash table.  */
1667   if (hashp == NULL)
1668     {
1669       /* Make a table for 8 composite characters initially.  */
1670       hashp = (cmpchar_hash_table[hash_idx]
1671                = (int *) xmalloc (sizeof (int) * (2 + 8)));
1672       CMPCHAR_HASH_SIZE (hashp) = 10;
1673       CMPCHAR_HASH_USED (hashp) = 2;
1674     }
1675   else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1676     {
1677       CMPCHAR_HASH_SIZE (hashp) += 8;
1678       hashp = (cmpchar_hash_table[hash_idx]
1679                = (int *) xrealloc (hashp,
1680                                    sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1681     }
1682   CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1683   CMPCHAR_HASH_USED (hashp)++;
1684
1685   /* Set information of the composite character in cmpchar_table.  */
1686   if (cmpchar_table_size == 0)
1687     {
1688       /* This is the first composite character to be registered.  */
1689       cmpchar_table_size = 256;
1690       cmpchar_table
1691         = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1692                                             * cmpchar_table_size);
1693     }
1694   else if (cmpchar_table_size <= n_cmpchars)
1695     {
1696       cmpchar_table_size += 256;
1697       cmpchar_table
1698         = (struct cmpchar_info **) xrealloc (cmpchar_table,
1699                                              sizeof (cmpchar_table[0])
1700                                              * cmpchar_table_size);
1701     }
1702
1703   cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1704
1705   cmpcharp->len = len;
1706   cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1707   bcopy (str, cmpcharp->data, len);
1708   cmpcharp->data[len] = 0;
1709   cmpcharp->glyph_len = chars;
1710   cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1711   if (embedded_rule)
1712     {
1713       cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1714       cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1715     }
1716   else
1717     {
1718       cmpcharp->cmp_rule = NULL;
1719       cmpcharp->col_offset = NULL;
1720     }
1721
1722   /* Setup GLYPH data and composition rules (if any) so as not to make
1723      them every time on displaying.  */
1724   {
1725     unsigned char *bufp;
1726     int width;
1727     float leftmost = 0.0, rightmost = 1.0;
1728
1729     if (embedded_rule)
1730       /* At first, col_offset[N] is set to relative to col_offset[0].  */
1731       cmpcharp->col_offset[0] = 0;
1732
1733     for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1734       {
1735         if (embedded_rule)
1736           cmpcharp->cmp_rule[i] = *bufp++;
1737
1738         if (*bufp == 0xA0)      /* This is an ASCII character.  */
1739           {
1740             cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1741             width = 1;
1742             bufp++;
1743           }
1744         else                    /* Multibyte character.  */
1745           {
1746             /* Make `bufp' point normal multi-byte form temporally.  */
1747             *bufp -= 0x20;
1748             cmpcharp->glyph[i]
1749               = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1750             width = WIDTH_BY_CHAR_HEAD (*bufp);
1751             *bufp += 0x20;
1752             bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1753           }
1754
1755         if (embedded_rule && i > 0)
1756           {
1757             /* Reference points (global_ref and new_ref) are
1758                encoded as below:
1759
1760                0--1--2 -- ascent
1761                |     |
1762                |     |
1763                |  4 -+--- center
1764             -- 3     5 -- baseline
1765                |     |
1766                6--7--8 -- descent
1767
1768                Now, we calculate the column offset of the new glyph
1769                from the left edge of the first glyph.  This can avoid
1770                the same calculation everytime displaying this
1771                composite character.  */
1772
1773             /* Reference points of global glyph and new glyph.  */
1774             int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1775             int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1776             /* Column offset relative to the first glyph.  */
1777             float left = (leftmost
1778                           + (global_ref % 3) * (rightmost - leftmost) / 2.0
1779                           - (new_ref % 3) * width / 2.0);
1780
1781             cmpcharp->col_offset[i] = left;
1782             if (left < leftmost)
1783               leftmost = left;
1784             if (left + width > rightmost)
1785               rightmost = left + width;
1786           }
1787         else
1788           {
1789             if (width > rightmost)
1790               rightmost = width;
1791           }
1792       }
1793     if (embedded_rule)
1794       {
1795         /* Now col_offset[N] are relative to the left edge of the
1796            first component.  Make them relative to the left edge of
1797            overall glyph.  */
1798         for (i = 0; i < chars; i++)
1799           cmpcharp->col_offset[i] -= leftmost;
1800         /* Make rightmost holds width of overall glyph.  */
1801         rightmost -= leftmost;
1802       }
1803
1804     cmpcharp->width = rightmost;
1805     if (cmpcharp->width < rightmost)
1806       /* To get a ceiling integer value.  */
1807       cmpcharp->width++;
1808   }
1809
1810   cmpchar_table[n_cmpchars] = cmpcharp;
1811
1812   return n_cmpchars++;
1813 }
1814
1815 /* Return the Nth element of the composite character C.  If NOERROR is
1816    nonzero, return 0 on error condition (C is an invalid composite
1817    charcter, or N is out of range). */
1818 int
1819 cmpchar_component (c, n, noerror)
1820      int c, n, noerror;
1821 {
1822   int id = COMPOSITE_CHAR_ID (c);
1823
1824   if (id < 0 || id >= n_cmpchars)
1825     {
1826       /* C is not a valid composite character.  */
1827       if (noerror) return 0;
1828       error ("Invalid composite character: %d", c)  ;
1829     }
1830   if (n >= cmpchar_table[id]->glyph_len)
1831     {
1832       /* No such component.  */
1833       if (noerror) return 0;
1834       args_out_of_range (make_number (c), make_number (n));
1835     }
1836   /* No face data is stored in glyph code.  */
1837   return ((int) (cmpchar_table[id]->glyph[n]));
1838 }
1839
1840 DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1841   "T if CHAR is a composite character.")
1842   (ch)
1843      Lisp_Object ch;
1844 {
1845   CHECK_NUMBER (ch, 0);
1846   return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1847 }
1848
1849 DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1850        2, 2, 0,
1851   "Return the Nth component character of composite character CHARACTER.")
1852   (character, n)
1853      Lisp_Object character, n;
1854 {
1855   int id;
1856
1857   CHECK_NUMBER (character, 0);
1858   CHECK_NUMBER (n, 1);
1859
1860   return (make_number (cmpchar_component (XINT (character), XINT (n), 0)));
1861 }
1862
1863 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1864        2, 2, 0,
1865   "Return the Nth composition rule of composite character CHARACTER.\n\
1866 The returned rule is for composing the Nth component\n\
1867 on the (N-1)th component.\n\
1868 If CHARACTER should be composed relatively or N is 0, return 255.")
1869   (character, n)
1870      Lisp_Object character, n;
1871 {
1872   int id;
1873
1874   CHECK_NUMBER (character, 0);
1875   CHECK_NUMBER (n, 1);
1876
1877   id = COMPOSITE_CHAR_ID (XINT (character));
1878   if (id < 0 || id >= n_cmpchars)
1879     error ("Invalid composite character: %d", XINT (character));
1880   if (XINT (n) < 0 || XINT (n) >= cmpchar_table[id]->glyph_len)
1881     args_out_of_range (character, n);
1882
1883   return make_number (cmpchar_table[id]->cmp_rule
1884                       ? cmpchar_table[id]->cmp_rule[XINT (n)]
1885                       : 255);
1886 }
1887
1888 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1889        Scmpchar_cmp_rule_p, 1, 1, 0,
1890   "Return non-nil if composite character CHARACTER contains a embedded rule.")
1891   (character)
1892      Lisp_Object character;
1893 {
1894   int id;
1895
1896   CHECK_NUMBER (character, 0);
1897   id = COMPOSITE_CHAR_ID (XINT (character));
1898   if (id < 0 || id >= n_cmpchars)
1899     error ("Invalid composite character: %d", XINT (character));
1900
1901   return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1902 }
1903
1904 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1905        Scmpchar_cmp_count, 1, 1, 0,
1906   "Return number of compoents of composite character CHARACTER.")
1907   (character)
1908      Lisp_Object character;
1909 {
1910   int id;
1911
1912   CHECK_NUMBER (character, 0);
1913   id = COMPOSITE_CHAR_ID (XINT (character));
1914   if (id < 0 || id >= n_cmpchars)
1915     error ("Invalid composite character: %d", XINT (character));
1916
1917   return (make_number (cmpchar_table[id]->glyph_len));
1918 }
1919
1920 DEFUN ("compose-string", Fcompose_string, Scompose_string,
1921        1, 1, 0,
1922   "Return one char string composed from all characters in STRING.")
1923   (str)
1924      Lisp_Object str;
1925 {
1926   unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1927   int len, i;
1928
1929   CHECK_STRING (str, 0);
1930
1931   buf[0] = LEADING_CODE_COMPOSITION;
1932   p = XSTRING (str)->data;
1933   pend = p + STRING_BYTES (XSTRING (str));
1934   i = 1;
1935   while (p < pend)
1936     {
1937       if (*p < 0x20) /* control code */
1938         error ("Invalid component character: %d", *p);
1939       else if (*p < 0x80)       /* ASCII */
1940         {
1941           if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1942             error ("Too long string to be composed: %s", XSTRING (str)->data);
1943           /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1944              code itself.  */
1945           buf[i++] = 0xA0;
1946           buf[i++] = *p++ + 0x80;
1947         }
1948       else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1949         {
1950           /* Already composed.  Eliminate the heading
1951              LEADING_CODE_COMPOSITION, keep the remaining bytes
1952              unchanged.  */
1953           p++;
1954           if (*p == 255)
1955             error ("Can't compose a rule-based composition character");
1956           ptemp = p;
1957           while (! CHAR_HEAD_P (*p)) p++;
1958           if (str_cmpchar_id (ptemp - 1, p - ptemp + 1) < 0)
1959             error ("Can't compose an invalid composition character");
1960           if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1961             error ("Too long string to be composed: %s", XSTRING (str)->data);
1962           bcopy (ptemp, buf + i, p - ptemp);
1963           i += p - ptemp;
1964         }
1965       else                      /* multibyte char */
1966         {
1967           /* Add 0x20 to the base leading-code, keep the remaining
1968              bytes unchanged.  */
1969           int c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
1970
1971           if (len <= 1 || ! CHAR_VALID_P (c, 0))
1972             error ("Can't compose an invalid character");
1973           if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1974             error ("Too long string to be composed: %s", XSTRING (str)->data);
1975           bcopy (p, buf + i, len);
1976           buf[i] += 0x20;
1977           p += len, i += len;
1978         }
1979     }
1980
1981   if (i < 5)
1982     /* STR contains only one character, which can't be composed.  */
1983     error ("Too short string to be composed: %s", XSTRING (str)->data);
1984
1985   return make_string_from_bytes (buf, 1, i);
1986 }
1987
1988 \f
1989 int
1990 charset_id_internal (charset_name)
1991      char *charset_name;
1992 {
1993   Lisp_Object val;
1994
1995   val= Fget (intern (charset_name), Qcharset);
1996   if (!VECTORP (val))
1997     error ("Charset %s is not defined", charset_name);
1998
1999   return (XINT (XVECTOR (val)->contents[0]));
2000 }
2001
2002 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
2003        Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
2004    ()
2005 {
2006   charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
2007   charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
2008   charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
2009   charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
2010   charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
2011   charset_big5_1 = charset_id_internal ("chinese-big5-1");
2012   charset_big5_2 = charset_id_internal ("chinese-big5-2");
2013   return Qnil;
2014 }
2015
2016 void
2017 init_charset_once ()
2018 {
2019   int i, j, k;
2020
2021   staticpro (&Vcharset_table);
2022   staticpro (&Vcharset_symbol_table);
2023   staticpro (&Vgeneric_character_list);
2024
2025   /* This has to be done here, before we call Fmake_char_table.  */
2026   Qcharset_table = intern ("charset-table");
2027   staticpro (&Qcharset_table);
2028
2029   /* Intern this now in case it isn't already done.
2030      Setting this variable twice is harmless.
2031      But don't staticpro it here--that is done in alloc.c.  */
2032   Qchar_table_extra_slots = intern ("char-table-extra-slots");
2033
2034   /* Now we are ready to set up this property, so we can
2035      create the charset table.  */
2036   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
2037   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
2038
2039   Qunknown = intern ("unknown");
2040   staticpro (&Qunknown);
2041   Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
2042                                         Qunknown);
2043
2044   /* Setup tables.  */
2045   for (i = 0; i < 2; i++)
2046     for (j = 0; j < 2; j++)
2047       for (k = 0; k < 128; k++)
2048         iso_charset_table [i][j][k] = -1;
2049
2050   bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
2051   cmpchar_table_size = n_cmpchars = 0;
2052
2053   for (i = 0; i < 256; i++)
2054     BYTES_BY_CHAR_HEAD (i) = 1;
2055   for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
2056        i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
2057     BYTES_BY_CHAR_HEAD (i) = 2;
2058   for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
2059        i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
2060     BYTES_BY_CHAR_HEAD (i) = 3;
2061   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
2062   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
2063   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
2064   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
2065   /* The followings don't reflect the actual bytes, but just to tell
2066      that it is a start of a multibyte character.  */
2067   BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
2068   BYTES_BY_CHAR_HEAD (0x9E) = 2;
2069   BYTES_BY_CHAR_HEAD (0x9F) = 2;
2070
2071   for (i = 0; i < 128; i++)
2072     WIDTH_BY_CHAR_HEAD (i) = 1;
2073   for (; i < 256; i++)
2074     WIDTH_BY_CHAR_HEAD (i) = 4;
2075   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
2076   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
2077   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
2078   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
2079
2080   {
2081     Lisp_Object val;
2082
2083     val = Qnil;
2084     for (i = 0x81; i < 0x90; i++)
2085       val = Fcons (make_number ((i - 0x70) << 7), val);
2086     for (; i < 0x9A; i++)
2087       val = Fcons (make_number ((i - 0x8F) << 14), val);
2088     for (i = 0xA0; i < 0xF0; i++)
2089       val = Fcons (make_number ((i - 0x70) << 7), val);
2090     for (; i < 0xFF; i++)
2091       val = Fcons (make_number ((i - 0xE0) << 14), val);
2092     val = Fcons (make_number (GENERIC_COMPOSITION_CHAR), val);
2093     Vgeneric_character_list = Fnreverse (val);
2094   }
2095
2096   nonascii_insert_offset = 0;
2097   Vnonascii_translation_table = Qnil;
2098 }
2099
2100 #ifdef emacs
2101
2102 void
2103 syms_of_charset ()
2104 {
2105   Qascii = intern ("ascii");
2106   staticpro (&Qascii);
2107
2108   Qcharset = intern ("charset");
2109   staticpro (&Qcharset);
2110
2111   /* Define ASCII charset now.  */
2112   update_charset_table (make_number (CHARSET_ASCII),
2113                         make_number (1), make_number (94),
2114                         make_number (1),
2115                         make_number (0),
2116                         make_number ('B'),
2117                         make_number (0),
2118                         build_string ("ASCII"),
2119                         build_string ("ASCII"),
2120                         build_string ("ASCII (ISO646 IRV)"));
2121   CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
2122   Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
2123
2124   Qcomposition = intern ("composition");
2125   staticpro (&Qcomposition);
2126   CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
2127
2128   Qauto_fill_chars = intern ("auto-fill-chars");
2129   staticpro (&Qauto_fill_chars);
2130   Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
2131
2132   defsubr (&Sdefine_charset);
2133   defsubr (&Sgeneric_character_list);
2134   defsubr (&Sget_unused_iso_final_char);
2135   defsubr (&Sdeclare_equiv_charset);
2136   defsubr (&Sfind_charset_region);
2137   defsubr (&Sfind_charset_string);
2138   defsubr (&Smake_char_internal);
2139   defsubr (&Ssplit_char);
2140   defsubr (&Schar_charset);
2141   defsubr (&Scharset_after);
2142   defsubr (&Siso_charset);
2143   defsubr (&Schar_valid_p);
2144   defsubr (&Sunibyte_char_to_multibyte);
2145   defsubr (&Smultibyte_char_to_unibyte);
2146   defsubr (&Schar_bytes);
2147   defsubr (&Schar_width);
2148   defsubr (&Sstring_width);
2149   defsubr (&Schar_direction);
2150   defsubr (&Schars_in_region);
2151   defsubr (&Sstring);
2152   defsubr (&Scmpcharp);
2153   defsubr (&Scmpchar_component);
2154   defsubr (&Scmpchar_cmp_rule);
2155   defsubr (&Scmpchar_cmp_rule_p);
2156   defsubr (&Scmpchar_cmp_count);
2157   defsubr (&Scompose_string);
2158   defsubr (&Ssetup_special_charsets);
2159
2160   DEFVAR_LISP ("charset-list", &Vcharset_list,
2161     "List of charsets ever defined.");
2162   Vcharset_list = Fcons (Qascii, Qnil);
2163
2164   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
2165     "Vector of cons cell of a symbol and translation table ever defined.\n\
2166 An ID of a translation table is an index of this vector.");
2167   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
2168
2169   DEFVAR_INT ("leading-code-composition", &leading_code_composition,
2170     "Leading-code of composite characters.");
2171   leading_code_composition = LEADING_CODE_COMPOSITION;
2172
2173   DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
2174     "Leading-code of private TYPE9N charset of column-width 1.");
2175   leading_code_private_11 = LEADING_CODE_PRIVATE_11;
2176
2177   DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
2178     "Leading-code of private TYPE9N charset of column-width 2.");
2179   leading_code_private_12 = LEADING_CODE_PRIVATE_12;
2180
2181   DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
2182     "Leading-code of private TYPE9Nx9N charset of column-width 1.");
2183   leading_code_private_21 = LEADING_CODE_PRIVATE_21;
2184
2185   DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
2186     "Leading-code of private TYPE9Nx9N charset of column-width 2.");
2187   leading_code_private_22 = LEADING_CODE_PRIVATE_22;
2188
2189   DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
2190     "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
2191 This is used for converting unibyte text to multibyte,\n\
2192 and for inserting character codes specified by number.\n\n\
2193 This serves to convert a Latin-1 or similar 8-bit character code\n\
2194 to the corresponding Emacs multibyte character code.\n\
2195 Typically the value should be (- (make-char CHARSET 0) 128),\n\
2196 for your choice of character set.\n\
2197 If `nonascii-translation-table' is non-nil, it overrides this variable.");
2198   nonascii_insert_offset = 0;
2199
2200   DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
2201     "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
2202 This is used for converting unibyte text to multibyte,\n\
2203 and for inserting character codes specified by number.\n\n\
2204 Conversion is performed only when multibyte characters are enabled,\n\
2205 and it serves to convert a Latin-1 or similar 8-bit character code\n\
2206 to the corresponding Emacs character code.\n\n\
2207 If this is nil, `nonascii-insert-offset' is used instead.\n\
2208 See also the docstring of `make-translation-table'.");
2209   Vnonascii_translation_table = Qnil;
2210
2211   DEFVAR_INT ("min-composite-char", &min_composite_char,
2212     "Minimum character code of a composite character.");
2213   min_composite_char = MIN_CHAR_COMPOSITION;
2214
2215   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
2216     "A char-table for characters which invoke auto-filling.\n\
2217 Such characters has value t in this table.");
2218   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
2219   CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
2220   CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
2221 }
2222
2223 #endif /* emacs */