code.delx.au - gnu-emacs/blob - src/charset.c

   1 /* Basic multilingual character support.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* At first, see the document in `charset.h' to understand the code in
  23    this file.  */
  24
  25 #include <stdio.h>
  26
  27 #ifdef emacs
  28
  29 #include <sys/types.h>
  30 #include <config.h>
  31 #include "lisp.h"
  32 #include "buffer.h"
  33 #include "charset.h"
  34 #include "coding.h"
  35 #include "disptab.h"
  36
  37 #else  /* not emacs */
  38
  39 #include "mulelib.h"
  40
  41 #endif /* emacs */
  42
  43 Lisp_Object Qcharset, Qascii, Qcomposition;
  44 Lisp_Object Qunknown;
  45
  46 /* Declaration of special leading-codes.  */
  47 int leading_code_composition;   /* for composite characters */
  48 int leading_code_private_11;    /* for private DIMENSION1 of 1-column */
  49 int leading_code_private_12;    /* for private DIMENSION1 of 2-column */
  50 int leading_code_private_21;    /* for private DIMENSION2 of 1-column */
  51 int leading_code_private_22;    /* for private DIMENSION2 of 2-column */
  52
  53 /* Declaration of special charsets.  */
  54 int charset_ascii;              /* ASCII */
  55 int charset_composition;        /* for a composite character */
  56 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
  57 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
  58 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
  59 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
  60 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
  61 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
  62 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
  63
  64 int min_composite_char;
  65
  66 Lisp_Object Qcharset_table;
  67
  68 /* A char-table containing information of each character set.  */
  69 Lisp_Object Vcharset_table;
  70
  71 /* A vector of charset symbol indexed by charset-id.  This is used
  72    only for returning charset symbol from C functions.  */
  73 Lisp_Object Vcharset_symbol_table;
  74
  75 /* A list of charset symbols ever defined.  */
  76 Lisp_Object Vcharset_list;
  77
  78 /* Vector of translation table ever defined.
  79    ID of a translation table is used to index this vector.  */
  80 Lisp_Object Vtranslation_table_vector;
  81
  82 /* A char-table for characters which may invoke auto-filling.  */
  83 Lisp_Object Vauto_fill_chars;
  84
  85 Lisp_Object Qauto_fill_chars;
  86
  87 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
  88 int bytes_by_char_head[256];
  89 int width_by_char_head[256];
  90
  91 /* Mapping table from ISO2022's charset (specified by DIMENSION,
  92    CHARS, and FINAL-CHAR) to Emacs' charset.  */
  93 int iso_charset_table[2][2][128];
  94
  95 /* Table of pointers to the structure `cmpchar_info' indexed by
  96    CMPCHAR-ID.  */
  97 struct cmpchar_info **cmpchar_table;
  98 /* The current size of `cmpchar_table'.  */
  99 static int cmpchar_table_size;
 100 /* Number of the current composite characters.  */
 101 int n_cmpchars;
 102
 103 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
 104 unsigned char *_fetch_multibyte_char_p;
 105 int _fetch_multibyte_char_len;
 106
 107 /* Offset to add to a non-ASCII value when inserting it.  */
 108 int nonascii_insert_offset;
 109
 110 /* Translation table for converting non-ASCII unibyte characters
 111    to multibyte codes, or nil.  */
 112 Lisp_Object Vnonascii_translation_table;
 113
 114 /* List of all possible generic characters.  */
 115 Lisp_Object Vgeneric_character_list;
 116
 117 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
 118 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
 119 \f
 120 void
 121 invalid_character (c)
 122      int c;
 123 {
 124   error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
 125 }
 126
 127 /* Parse composite character string STR of length LENGTH (>= 2) and
 128    set BYTES, CHARSET, C1, and C2 as below.
 129
 130    It is assumed that *STR is LEADING_CODE_COMPOSITION and the
 131    following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
 132
 133    If there is a valid composite character, set CHARSET, C1, and C2 to
 134    such values that MAKE_CHAR can make the composite character from
 135    them.  Otherwise, set CHARSET to CHARSET_COMPOSITION, set C1 to the
 136    second byte of the sequence, C2 to -1 so that MAKE_CHAR can make
 137    the invalid multibyte character whose string representation is two
 138    bytes of STR[0] and STR[1].  In any case, set BYTES to LENGTH.
 139
 140    This macro should be called only from SPLIT_MULTIBYTE_SEQ.  */
 141
 142 #define SPLIT_COMPOSITE_SEQ(str, length, bytes, charset, c1, c2)        \
 143   do {                                                                  \
 144     int cmpchar_id = str_cmpchar_id ((str), (length));                  \
 145                                                                         \
 146     (charset) = CHARSET_COMPOSITION;                                    \
 147     (bytes) = (length);                                                 \
 148     if (cmpchar_id >= 0)                                                \
 149       {                                                                 \
 150         (c1) = CHAR_FIELD2 (cmpchar_id);                                \
 151         (c2) = CHAR_FIELD3 (cmpchar_id);                                \
 152       }                                                                 \
 153     else                                                                \
 154       {                                                                 \
 155         (c1) = (str)[1] & 0x7F;                                         \
 156         (c2) = -1;                                                      \
 157       }                                                                 \
 158   } while (0)
 159
 160 /* Parse non-composite multibyte character string STR of length LENGTH
 161    (>= 2) and set BYTES to the length of actual multibyte sequence,
 162    CHARSET, C1, and C2 to such values that MAKE_CHAR can make the
 163    multibyte character from them.
 164
 165    It is assumed that *STR is one of base leading codes (excluding
 166    LEADING_CODE_COMPOSITION) and the following (LENGTH - 1) bytes
 167    satisfy !CHAR_HEAD_P.
 168
 169    This macro should be called only from SPLIT_MULTIBYTE_SEQ.  */
 170
 171 #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2)        \
 172   do {                                                                  \
 173     (bytes) = 1;                                                        \
 174     (charset) = (str)[0];                                               \
 175     if ((charset) >= LEADING_CODE_PRIVATE_11                            \
 176         && (charset) <= LEADING_CODE_PRIVATE_22)                        \
 177       (charset) = (str)[(bytes)++];                                     \
 178     if ((bytes) < (length))                                             \
 179       {                                                                 \
 180         (c1) = (str)[(bytes)++] & 0x7F;                                 \
 181         if ((bytes) < (length))                                         \
 182           (c2) = (str)[(bytes)++] & 0x7F;                               \
 183         else                                                            \
 184           (c2) = -1;                                                    \
 185       }                                                                 \
 186     else                                                                \
 187       (c1) = (c2) = -1;                                                 \
 188   } while (0)
 189
 190 /* Parse string STR of length LENGTH and check if a multibyte
 191    characters is at STR.  set BYTES to the actual length, CHARSET, C1,
 192    C2 to proper values for that character.  */
 193
 194 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)        \
 195   do {                                                                  \
 196     int i;                                                              \
 197     if (ASCII_BYTE_P ((str)[0]))                                        \
 198       i = 1;                                                            \
 199     else                                                                \
 200       for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++);       \
 201     if (i == 1)                                                         \
 202       (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ;         \
 203     else if ((str)[0] == LEADING_CODE_COMPOSITION)                      \
 204       SPLIT_COMPOSITE_SEQ (str, i, bytes, charset, c1, c2);             \
 205     else                                                                \
 206       {                                                                 \
 207         if (i > BYTES_BY_CHAR_HEAD ((str)[0]))                          \
 208           i = BYTES_BY_CHAR_HEAD ((str)[0]);                            \
 209         SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2);           \
 210       }                                                                 \
 211   } while (0)
 212
 213 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  */
 214 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
 215   (CHARSET_DIMENSION (charset) == 1             \
 216    ? ((c1) >= 0x20 && (c1) <= 0x7F)             \
 217    : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F))
 218
 219 /* Set STR a pointer to the multi-byte form of the character C.  If C
 220    is not a composite character, the multi-byte form is set in WORKBUF
 221    and STR points WORKBUF.  The caller should allocate at least 4-byte
 222    area at WORKBUF in advance.  Returns the length of the multi-byte
 223    form.  If C is an invalid character, store (C & 0xFF) in WORKBUF[0]
 224    and return 1.
 225
 226    Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
 227    function directly if C can be an ASCII character.  */
 228
 229 int
 230 non_ascii_char_to_string (c, workbuf, str)
 231      int c;
 232      unsigned char *workbuf, **str;
 233 {
 234   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
 235     {
 236       /* Multibyte character can't have a modifier bit.  */
 237       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 238         invalid_character (c);
 239
 240       /* For Meta, Shift, and Control modifiers, we need special care.  */
 241       if (c & CHAR_META)
 242         {
 243           /* Move the meta bit to the right place for a string.  */
 244           c = (c & ~CHAR_META) | 0x80;
 245         }
 246       if (c & CHAR_SHIFT)
 247         {
 248           /* Shift modifier is valid only with [A-Za-z].  */
 249           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 250             c &= ~CHAR_SHIFT;
 251           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 252             c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 253         }
 254       if (c & CHAR_CTL)
 255         {
 256           /* Simulate the code in lread.c.  */
 257           /* Allow `\C- ' and `\C-?'.  */
 258           if (c == (CHAR_CTL | ' '))
 259             c = 0;
 260           else if (c == (CHAR_CTL | '?'))
 261             c = 127;
 262           /* ASCII control chars are made from letters (both cases),
 263              as well as the non-letters within 0100...0137.  */
 264           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 265             c &= (037 | (~0177 & ~CHAR_CTL));
 266           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 267             c &= (037 | (~0177 & ~CHAR_CTL));
 268         }
 269
 270       /* If C still has any modifier bits, it is an invalid character.  */
 271       if (c & CHAR_MODIFIER_MASK)
 272         invalid_character (c);
 273
 274       *str = workbuf;
 275       *workbuf++ = c;
 276     }
 277   else
 278     {
 279       int charset, c1, c2;
 280
 281       SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
 282       if (charset == CHARSET_COMPOSITION)
 283         {
 284           if (c >= MAX_CHAR)
 285             invalid_character (c);
 286           if (c >= MIN_CHAR_COMPOSITION)
 287             {
 288               /* Valid composite character.  */
 289               *str = cmpchar_table[COMPOSITE_CHAR_ID (c)]->data;
 290               workbuf = *str + cmpchar_table[COMPOSITE_CHAR_ID (c)]->len;
 291             }
 292           else
 293             {
 294               /* Invalid but can have multibyte form.  */
 295               *str = workbuf;
 296               *workbuf++ = LEADING_CODE_COMPOSITION;
 297               *workbuf++ = c1 | 0x80;
 298             }
 299         }
 300       else if (charset > CHARSET_COMPOSITION)
 301         {
 302           *str = workbuf;
 303           if (charset >= LEADING_CODE_EXT_11)
 304             *workbuf++ = (charset < LEADING_CODE_EXT_12
 305                           ? LEADING_CODE_PRIVATE_11
 306                           : (charset < LEADING_CODE_EXT_21
 307                              ? LEADING_CODE_PRIVATE_12
 308                              : (charset < LEADING_CODE_EXT_22
 309                                 ? LEADING_CODE_PRIVATE_21
 310                                 : LEADING_CODE_PRIVATE_22)));
 311           *workbuf++ = charset;
 312           if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
 313             invalid_character (c);
 314           if (c1)
 315             {
 316               *workbuf++ = c1 | 0x80;
 317               if (c2 > 0)
 318                 *workbuf++ = c2 | 0x80;
 319             }
 320         }
 321       else if (charset == CHARSET_ASCII)
 322         *workbuf++= c & 0x7F;
 323       else
 324         invalid_character (c);
 325     }
 326
 327   return (workbuf - *str);
 328 }
 329
 330 /* Return the non-ASCII character corresponding to multi-byte form at
 331    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
 332    length of the multibyte form in *ACTUAL_LEN.
 333
 334    Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
 335    directly if you want ot handle ASCII characters as well.  */
 336
 337 int
 338 string_to_non_ascii_char (str, len, actual_len)
 339      const unsigned char *str;
 340      int len, *actual_len;
 341 {
 342   int c, bytes, charset, c1, c2;
 343
 344   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 345   c = MAKE_CHAR (charset, c1, c2);
 346   if (actual_len)
 347     *actual_len = bytes;
 348   return c;
 349 }
 350
 351 /* Return the length of the multi-byte form at string STR of length LEN.
 352    Use the macro MULTIBYTE_FORM_LENGTH instead.  */
 353 int
 354 multibyte_form_length (str, len)
 355      const unsigned char *str;
 356      int len;
 357 {
 358   int bytes;
 359
 360   PARSE_MULTIBYTE_SEQ (str, len, bytes);
 361   return bytes;
 362 }
 363
 364 /* Check multibyte form at string STR of length LEN and set variables
 365    pointed by CHARSET, C1, and C2 to charset and position codes of the
 366    character at STR, and return 0.  If there's no multibyte character,
 367    return -1.  This should be used only in the macro SPLIT_STRING
 368    which checks range of STR in advance.  */
 369
 370 int
 371 split_non_ascii_string (str, len, charset, c1, c2)
 372      const unsigned char *str;
 373      unsigned char *c1, *c2;
 374      int len, *charset;
 375 {
 376   register int bytes, cs, code1, code2 = -1;
 377
 378   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
 379   if (cs == CHARSET_ASCII)
 380     return -1;
 381   *charset = cs;
 382   *c1 = code1;
 383   *c2 = code2;
 384 }
 385
 386 /* Return 1 iff character C has valid printable glyph.
 387    Use the macro CHAR_PRINTABLE_P instead.  */
 388 int
 389 char_printable_p (c)
 390      int c;
 391 {
 392   int charset, c1, c2, chars;
 393
 394   if (SINGLE_BYTE_CHAR_P (c))
 395     return 1;
 396   if (c >= MIN_CHAR_COMPOSITION)
 397     return (c < MAX_CHAR);
 398
 399   SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
 400   if (! CHARSET_DEFINED_P (charset))
 401     return 0;
 402   if (CHARSET_CHARS (charset) == 94
 403       ? c1 <= 32 || c1 >= 127
 404       : c1 < 32)
 405     return 0;
 406   if (CHARSET_DIMENSION (charset) == 2
 407       && (CHARSET_CHARS (charset) == 94
 408           ? c2 <= 32 || c2 >= 127
 409           : c2 < 32))
 410     return 0;
 411   return 1;
 412 }
 413
 414 /* Translate character C by translation table TABLE.  If C
 415    is negative, translate a character specified by CHARSET, C1, and C2
 416    (C1 and C2 are code points of the character).  If no translation is
 417    found in TABLE, return C.  */
 418 int
 419 translate_char (table, c, charset, c1, c2)
 420      Lisp_Object table;
 421      int c, charset, c1, c2;
 422 {
 423   Lisp_Object ch;
 424   int alt_charset, alt_c1, alt_c2, dimension;
 425
 426   if (c < 0) c = MAKE_CHAR (charset, c1, c2);
 427   if (!CHAR_TABLE_P (table)
 428       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
 429     return c;
 430
 431   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
 432   dimension = CHARSET_DIMENSION (alt_charset);
 433   if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
 434     /* CH is not a generic character, just return it.  */
 435     return XFASTINT (ch);
 436
 437   /* Since CH is a generic character, we must return a specific
 438      charater which has the same position codes as C from CH.  */
 439   if (charset < 0)
 440     SPLIT_CHAR (c, charset, c1, c2);
 441   if (dimension != CHARSET_DIMENSION (charset))
 442     /* We can't make such a character because of dimension mismatch.  */
 443     return c;
 444   return MAKE_CHAR (alt_charset, c1, c2);
 445 }
 446
 447 /* Convert the unibyte character C to multibyte based on
 448    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
 449    convert C to a valid multibyte character, convert it based on
 450    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
 451
 452 int
 453 unibyte_char_to_multibyte (c)
 454      int c;
 455 {
 456   if (c < 0400 && c >= 0200)
 457     {
 458       int c_save = c;
 459
 460       if (! NILP (Vnonascii_translation_table))
 461         {
 462           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
 463           if (c >= 0400 && ! char_valid_p (c, 0))
 464             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 465         }
 466       else if (c >= 0240 && nonascii_insert_offset > 0)
 467         {
 468           c += nonascii_insert_offset;
 469           if (c < 0400 || ! char_valid_p (c, 0))
 470             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 471         }
 472       else if (c >= 0240)
 473         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 474     }
 475   return c;
 476 }
 477
 478
 479 /* Convert the multibyte character C to unibyte 8-bit character based
 480    on Vnonascii_translation_table or nonascii_insert_offset.  If
 481    REV_TBL is non-nil, it should be a reverse table of
 482    Vnonascii_translation_table, i.e. what given by:
 483      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
 484
 485 int
 486 multibyte_char_to_unibyte (c, rev_tbl)
 487      int c;
 488      Lisp_Object rev_tbl;
 489 {
 490   if (!SINGLE_BYTE_CHAR_P (c))
 491     {
 492       int c_save = c;
 493
 494       if (! CHAR_TABLE_P (rev_tbl)
 495           && CHAR_TABLE_P (Vnonascii_translation_table))
 496         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
 497                                           make_number (0));
 498       if (CHAR_TABLE_P (rev_tbl))
 499         {
 500           Lisp_Object temp;
 501           temp = Faref (rev_tbl, make_number (c));
 502           if (INTEGERP (temp))
 503             c = XINT (temp);
 504           if (c >= 256)
 505             c = (c_save & 0177) + 0200;
 506         }
 507       else
 508         {
 509           if (nonascii_insert_offset > 0)
 510             c -= nonascii_insert_offset;
 511           if (c < 128 || c >= 256)
 512             c = (c_save & 0177) + 0200;
 513         }
 514     }
 515
 516   return c;
 517 }
 518
 519 \f
 520 /* Update the table Vcharset_table with the given arguments (see the
 521    document of `define-charset' for the meaning of each argument).
 522    Several other table contents are also updated.  The caller should
 523    check the validity of CHARSET-ID and the remaining arguments in
 524    advance.  */
 525
 526 void
 527 update_charset_table (charset_id, dimension, chars, width, direction,
 528                       iso_final_char, iso_graphic_plane,
 529                       short_name, long_name, description)
 530      Lisp_Object charset_id, dimension, chars, width, direction;
 531      Lisp_Object iso_final_char, iso_graphic_plane;
 532      Lisp_Object short_name, long_name, description;
 533 {
 534   int charset = XINT (charset_id);
 535   int bytes;
 536   unsigned char leading_code_base, leading_code_ext;
 537
 538   if (NILP (CHARSET_TABLE_ENTRY (charset)))
 539     CHARSET_TABLE_ENTRY (charset)
 540       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
 541
 542   /* Get byte length of multibyte form, base leading-code, and
 543      extended leading-code of the charset.  See the comment under the
 544      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
 545   bytes = XINT (dimension);
 546   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 547     {
 548       /* Official charset, it doesn't have an extended leading-code.  */
 549       if (charset != CHARSET_ASCII)
 550         bytes += 1; /* For a base leading-code.  */
 551       leading_code_base = charset;
 552       leading_code_ext = 0;
 553     }
 554   else
 555     {
 556       /* Private charset.  */
 557       bytes += 2; /* For base and extended leading-codes.  */
 558       leading_code_base
 559         = (charset < LEADING_CODE_EXT_12
 560            ? LEADING_CODE_PRIVATE_11
 561            : (charset < LEADING_CODE_EXT_21
 562               ? LEADING_CODE_PRIVATE_12
 563               : (charset < LEADING_CODE_EXT_22
 564                  ? LEADING_CODE_PRIVATE_21
 565                  : LEADING_CODE_PRIVATE_22)));
 566       leading_code_ext = charset;
 567     }
 568
 569   if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
 570     error ("Invalid dimension for the charset-ID %d", charset);
 571
 572   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
 573   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
 574   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
 575   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
 576   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
 577   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
 578   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
 579     = make_number (leading_code_base);
 580   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
 581     = make_number (leading_code_ext);
 582   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
 583   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
 584     = iso_graphic_plane;
 585   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
 586   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
 587   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
 588   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
 589
 590   {
 591     /* If we have already defined a charset which has the same
 592        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
 593        DIRECTION, we must update the entry REVERSE-CHARSET of both
 594        charsets.  If there's no such charset, the value of the entry
 595        is set to nil.  */
 596     int i;
 597
 598     for (i = 0; i <= MAX_CHARSET; i++)
 599       if (!NILP (CHARSET_TABLE_ENTRY (i)))
 600         {
 601           if (CHARSET_DIMENSION (i) == XINT (dimension)
 602               && CHARSET_CHARS (i) == XINT (chars)
 603               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
 604               && CHARSET_DIRECTION (i) != XINT (direction))
 605             {
 606               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 607                 = make_number (i);
 608               CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
 609               break;
 610             }
 611         }
 612     if (i > MAX_CHARSET)
 613       /* No such a charset.  */
 614       CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 615         = make_number (-1);
 616   }
 617
 618   if (charset != CHARSET_ASCII
 619       && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 620     {
 621       width_by_char_head[leading_code_base] = XINT (width);
 622
 623       /* Update table emacs_code_class.  */
 624       emacs_code_class[charset] = (bytes == 2
 625                                    ? EMACS_leading_code_2
 626                                    : (bytes == 3
 627                                       ? EMACS_leading_code_3
 628                                       : EMACS_leading_code_4));
 629     }
 630
 631   /* Update table iso_charset_table.  */
 632   if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
 633     ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
 634 }
 635
 636 #ifdef emacs
 637
 638 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
 639    is invalid.  */
 640 int
 641 get_charset_id (charset_symbol)
 642      Lisp_Object charset_symbol;
 643 {
 644   Lisp_Object val;
 645   int charset;
 646
 647   return ((SYMBOLP (charset_symbol)
 648            && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
 649            && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
 650                CHARSET_VALID_P (charset)))
 651           ? charset : -1);
 652 }
 653
 654 /* Return an identification number for a new private charset of
 655    DIMENSION and WIDTH.  If there's no more room for the new charset,
 656    return 0.  */
 657 Lisp_Object
 658 get_new_private_charset_id (dimension, width)
 659      int dimension, width;
 660 {
 661   int charset, from, to;
 662
 663   if (dimension == 1)
 664     {
 665       if (width == 1)
 666         from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
 667       else
 668         from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
 669     }
 670   else
 671     {
 672       if (width == 1)
 673         from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
 674       else
 675         from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
 676     }
 677
 678   for (charset = from; charset < to; charset++)
 679     if (!CHARSET_DEFINED_P (charset)) break;
 680
 681   return make_number (charset < to ? charset : 0);
 682 }
 683
 684 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
 685   "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
 686 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
 687  treated as a private charset.\n\
 688 INFO-VECTOR is a vector of the format:\n\
 689    [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
 690     SHORT-NAME LONG-NAME DESCRIPTION]\n\
 691 The meanings of each elements is as follows:\n\
 692 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
 693 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
 694 WIDTH (integer) is the number of columns a character in the charset\n\
 695 occupies on the screen: one of 0, 1, and 2.\n\
 696 \n\
 697 DIRECTION (integer) is the rendering direction of characters in the\n\
 698 charset when rendering.  If 0, render from left to right, else\n\
 699 render from right to left.\n\
 700 \n\
 701 ISO-FINAL-CHAR (character) is the final character of the\n\
 702 corresponding ISO 2022 charset.\n\
 703 \n\
 704 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
 705 while encoding to variants of ISO 2022 coding system, one of the\n\
 706 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
 707 \n\
 708 SHORT-NAME (string) is the short name to refer to the charset.\n\
 709 \n\
 710 LONG-NAME (string) is the long name to refer to the charset.\n\
 711 \n\
 712 DESCRIPTION (string) is the description string of the charset.")
 713   (charset_id, charset_symbol, info_vector)
 714      Lisp_Object charset_id, charset_symbol, info_vector;
 715 {
 716   Lisp_Object *vec;
 717
 718   if (!NILP (charset_id))
 719     CHECK_NUMBER (charset_id, 0);
 720   CHECK_SYMBOL (charset_symbol, 1);
 721   CHECK_VECTOR (info_vector, 2);
 722
 723   if (! NILP (charset_id))
 724     {
 725       if (! CHARSET_VALID_P (XINT (charset_id)))
 726         error ("Invalid CHARSET: %d", XINT (charset_id));
 727       else if (CHARSET_DEFINED_P (XINT (charset_id)))
 728         error ("Already defined charset: %d", XINT (charset_id));
 729     }
 730
 731   vec = XVECTOR (info_vector)->contents;
 732   if (XVECTOR (info_vector)->size != 9
 733       || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
 734       || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
 735       || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
 736       || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
 737       || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
 738       || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
 739       || !STRINGP (vec[6])
 740       || !STRINGP (vec[7])
 741       || !STRINGP (vec[8]))
 742     error ("Invalid info-vector argument for defining charset %s",
 743            XSYMBOL (charset_symbol)->name->data);
 744
 745   if (NILP (charset_id))
 746     {
 747       charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
 748       if (XINT (charset_id) == 0)
 749         error ("There's no room for a new private charset %s",
 750                XSYMBOL (charset_symbol)->name->data);
 751     }
 752
 753   update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
 754                         vec[4], vec[5], vec[6], vec[7], vec[8]);
 755   Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
 756   CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
 757   Vcharset_list = Fcons (charset_symbol, Vcharset_list);
 758   return Qnil;
 759 }
 760
 761 DEFUN ("generic-character-list", Fgeneric_character_list,
 762        Sgeneric_character_list, 0, 0, 0,
 763   "Return a list of all possible generic characters.\n\
 764 It includes a generic character for a charset not yet defined.")
 765   ()
 766 {
 767   return Vgeneric_character_list;
 768 }
 769
 770 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
 771        Sget_unused_iso_final_char, 2, 2, 0,
 772   "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
 773 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
 774 CHARS is the number of characters in a dimension: 94 or 96.\n\
 775 \n\
 776 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
 777 If there's no unused final char for the specified kind of charset,\n\
 778 return nil.")
 779   (dimension, chars)
 780      Lisp_Object dimension, chars;
 781 {
 782   int final_char;
 783
 784   CHECK_NUMBER (dimension, 0);
 785   CHECK_NUMBER (chars, 1);
 786   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 787     error ("Invalid charset dimension %d, it should be 1 or 2",
 788            XINT (dimension));
 789   if (XINT (chars) != 94 && XINT (chars) != 96)
 790     error ("Invalid charset chars %d, it should be 94 or 96",
 791            XINT (chars));
 792   for (final_char = '0'; final_char <= '?'; final_char++)
 793     {
 794       if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
 795         break;
 796     }
 797   return (final_char <= '?' ? make_number (final_char) : Qnil);
 798 }
 799
 800 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
 801        4, 4, 0,
 802   "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
 803 CHARSET should be defined by `defined-charset' in advance.")
 804   (dimension, chars, final_char, charset_symbol)
 805      Lisp_Object dimension, chars, final_char, charset_symbol;
 806 {
 807   int charset;
 808
 809   CHECK_NUMBER (dimension, 0);
 810   CHECK_NUMBER (chars, 1);
 811   CHECK_NUMBER (final_char, 2);
 812   CHECK_SYMBOL (charset_symbol, 3);
 813
 814   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 815     error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
 816   if (XINT (chars) != 94 && XINT (chars) != 96)
 817     error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
 818   if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
 819     error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
 820   if ((charset = get_charset_id (charset_symbol)) < 0)
 821     error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
 822
 823   ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
 824   return Qnil;
 825 }
 826
 827 /* Return number of different charsets in STR of length LEN.  In
 828    addition, for each found charset N, CHARSETS[N] is set 1.  The
 829    caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
 830    It may lookup a translation table TABLE if supplied.
 831
 832    If CMPCHARP is nonzero and some composite character is found,
 833    CHARSETS[128] is also set 1 and the returned number is incremented
 834    by 1.
 835
 836    If MULTIBYTE is zero, do not check multibyte characters, i.e. if
 837    any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
 838    8-bit codes are found CHARSET[1] is set to 1.  */
 839
 840 int
 841 find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
 842      unsigned char *str;
 843      int len, *charsets;
 844      Lisp_Object table;
 845      int cmpcharp;
 846      int multibyte;
 847 {
 848   register int num = 0, c;
 849
 850   if (! multibyte)
 851     {
 852       unsigned char *endp = str + len;
 853       int maskbits = 0;
 854
 855       while (str < endp && maskbits != 3)
 856         maskbits |=  (*str++ < 0x80 ? 1 : 2);
 857       if (maskbits & 1)
 858         {
 859           charsets[0] = 1;
 860           num++;
 861         }
 862       if (maskbits & 2)
 863         {
 864           charsets[1] = 1;
 865           num++;
 866         }
 867       return num;
 868     }
 869
 870   if (! CHAR_TABLE_P (table))
 871     table = Qnil;
 872
 873   while (len > 0)
 874     {
 875       int bytes, charset;
 876       c = *str;
 877
 878       if (c == LEADING_CODE_COMPOSITION)
 879         {
 880           int cmpchar_id = str_cmpchar_id (str, len);
 881           GLYPH *glyph;
 882
 883           if (cmpchar_id >= 0)
 884             {
 885               struct cmpchar_info *cmp_p = cmpchar_table[cmpchar_id];
 886               int i;
 887
 888               for (i = 0; i < cmp_p->glyph_len; i++)
 889                 {
 890                   c = cmp_p->glyph[i];
 891                   if (!NILP (table))
 892                     {
 893                       if ((c = translate_char (table, c, 0, 0, 0)) < 0)
 894                         c = cmp_p->glyph[i];
 895                     }
 896                   if ((charset = CHAR_CHARSET (c)) < 0)
 897                     charset = CHARSET_ASCII;
 898                   if (!charsets[charset])
 899                     {
 900                       charsets[charset] = 1;
 901                       num += 1;
 902                     }
 903                 }
 904               str += cmp_p->len;
 905               len -= cmp_p->len;
 906               if (cmpcharp && !charsets[CHARSET_COMPOSITION])
 907                 {
 908                   charsets[CHARSET_COMPOSITION] = 1;
 909                   num += 1;
 910                 }
 911               continue;
 912             }
 913
 914           charset = 1;          /* This leads to `unknown' charset.  */
 915           bytes = 1;
 916         }
 917       else
 918         {
 919           c = STRING_CHAR_AND_LENGTH (str, len, bytes);
 920           if (! NILP (table))
 921             {
 922               int c1 = translate_char (table, c, 0, 0, 0);
 923               if (c1 >= 0)
 924                 c = c1;
 925             }
 926           charset = CHAR_CHARSET (c);
 927         }
 928
 929       if (!charsets[charset])
 930         {
 931           charsets[charset] = 1;
 932           num += 1;
 933         }
 934       str += bytes;
 935       len -= bytes;
 936     }
 937   return num;
 938 }
 939
 940 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
 941        2, 3, 0,
 942   "Return a list of charsets in the region between BEG and END.\n\
 943 BEG and END are buffer positions.\n\
 944 If the region contains any composite character,\n\
 945 `composition' is included in the returned list.\n\
 946 Optional arg TABLE if non-nil is a translation table to look up.\n\
 947 \n\
 948 If the region contains invalid multiybte characters,\n\
 949 `unknown' is included in the returned list.\n\
 950 \n\
 951 If the current buffer is unibyte, the returned list contains\n\
 952 `ascii' if any 7-bit characters are found,\n\
 953 and `unknown' if any 8-bit characters are found.")
 954   (beg, end, table)
 955      Lisp_Object beg, end, table;
 956 {
 957   int charsets[MAX_CHARSET + 1];
 958   int from, from_byte, to, stop, stop_byte, i;
 959   Lisp_Object val;
 960   int undefined;
 961   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
 962
 963   validate_region (&beg, &end);
 964   from = XFASTINT (beg);
 965   stop = to = XFASTINT (end);
 966
 967   if (from < GPT && GPT < to)
 968     {
 969       stop = GPT;
 970       stop_byte = GPT_BYTE;
 971     }
 972   else
 973     stop_byte = CHAR_TO_BYTE (stop);
 974
 975   from_byte = CHAR_TO_BYTE (from);
 976
 977   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 978   while (1)
 979     {
 980       find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
 981                            charsets, table, 1, multibyte);
 982       if (stop < to)
 983         {
 984           from = stop, from_byte = stop_byte;
 985           stop = to, stop_byte = CHAR_TO_BYTE (stop);
 986         }
 987       else
 988         break;
 989     }
 990
 991   val = Qnil;
 992   undefined = 0;
 993   for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
 994     if (charsets[i])
 995       {
 996         if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
 997           val = Fcons (CHARSET_SYMBOL (i), val);
 998         else
 999           undefined = 1;
1000       }
1001   if (undefined)
1002     val = Fcons (Qunknown, val);
1003   return val;
1004 }
1005
1006 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1007        1, 2, 0,
1008   "Return a list of charsets in STR.\n\
1009 If the string contains any composite characters,\n\
1010 `composition' is included in the returned list.\n\
1011 Optional arg TABLE if non-nil is a translation table to look up.\n\
1012 \n\
1013 If the region contains invalid multiybte characters,\n\
1014 `unknown' is included in the returned list.\n\
1015 \n\
1016 If STR is unibyte, the returned list contains\n\
1017 `ascii' if any 7-bit characters are found,\n\
1018 and `unknown' if any 8-bit characters are found.")
1019   (str, table)
1020      Lisp_Object str, table;
1021 {
1022   int charsets[MAX_CHARSET + 1];
1023   int i;
1024   Lisp_Object val;
1025   int undefined;
1026   int multibyte;
1027
1028   CHECK_STRING (str, 0);
1029   multibyte = STRING_MULTIBYTE (str);
1030
1031   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
1032   find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
1033                        charsets, table, 1, multibyte);
1034   val = Qnil;
1035   undefined = 0;
1036   for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
1037     if (charsets[i])
1038       {
1039         if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
1040           val = Fcons (CHARSET_SYMBOL (i), val);
1041         else
1042           undefined = 1;
1043       }
1044   if (undefined)
1045     val = Fcons (Qunknown, val);
1046   return val;
1047 }
1048 \f
1049 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
1050   "")
1051   (charset, code1, code2)
1052      Lisp_Object charset, code1, code2;
1053 {
1054   int charset_id, c1, c2;
1055
1056   CHECK_NUMBER (charset, 0);
1057   charset_id = XINT (charset);
1058   if (!CHARSET_DEFINED_P (charset_id))
1059     error ("Invalid charset ID: %d", XINT (charset));
1060
1061   if (NILP (code1))
1062     c1 = 0;
1063   else
1064     {
1065       CHECK_NUMBER (code1, 1);
1066       c1 = XINT (code1);
1067     }
1068   if (NILP (code2))
1069     c2 = 0;
1070   else
1071     {
1072       CHECK_NUMBER (code2, 2);
1073       c2 = XINT (code2);
1074     }
1075
1076   if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1077     error ("Invalid code points: %d %d", c1, c2);
1078   c1 &= 0x7F;
1079   c2 &= 0x7F;
1080   if (c1 == 0
1081       ? c2 != 0
1082       : (c2 == 0
1083          ? !CHAR_COMPONENTS_VALID_P (charset, c1, 0x20)
1084          : !CHAR_COMPONENTS_VALID_P (charset, c1, c2)))
1085     error ("Invalid code points: %d %d", c1, c2);
1086
1087   return make_number (MAKE_CHAR (charset_id, c1, c2));
1088 }
1089
1090 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1091   "Return list of charset and one or two position-codes of CHAR.\n\
1092 If CHAR is invalid as a character code,\n\
1093 return a list of symbol `unknown' and CHAR.")
1094   (ch)
1095      Lisp_Object ch;
1096 {
1097   Lisp_Object val;
1098   int c, charset, c1, c2;
1099
1100   CHECK_NUMBER (ch, 0);
1101   c = XFASTINT (ch);
1102   if (!CHAR_VALID_P (c, 1))
1103     return Fcons (Qunknown, Fcons (ch, Qnil));
1104   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1105   return (c2 >= 0
1106           ? Fcons (CHARSET_SYMBOL (charset),
1107                    Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1108           : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1109 }
1110
1111 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1112   "Return charset of CHAR.")
1113   (ch)
1114      Lisp_Object ch;
1115 {
1116   CHECK_NUMBER (ch, 0);
1117
1118   return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1119 }
1120
1121 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1122   "Return charset of a character in the current buffer at position POS.\n\
1123 If POS is nil, it defauls to the current point.\n\
1124 If POS is out of range, the value is nil.")
1125   (pos)
1126      Lisp_Object pos;
1127 {
1128   register int pos_byte, bytes, charset, c1, c2;
1129   register unsigned char *p;
1130
1131   if (NILP (pos))
1132     pos_byte = PT_BYTE;
1133   else if (MARKERP (pos))
1134     {
1135       pos_byte = marker_byte_position (pos);
1136       if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1137         return Qnil;
1138     }
1139   else
1140     {
1141       CHECK_NUMBER (pos, 0);
1142       if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1143         return Qnil;
1144       pos_byte = CHAR_TO_BYTE (XINT (pos));
1145     }
1146   p = BYTE_POS_ADDR (pos_byte);
1147   if (BASE_LEADING_CODE_P (*p))
1148     {
1149       SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1150       if (charset < 0)
1151         charset = 1;
1152     }
1153   else
1154     charset = CHARSET_ASCII;
1155
1156   return CHARSET_SYMBOL (charset);
1157 }
1158
1159 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1160   "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1161 \n\
1162 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1163 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1164 where as Emacs distinguishes them by charset symbol.\n\
1165 See the documentation of the function `charset-info' for the meanings of\n\
1166 DIMENSION, CHARS, and FINAL-CHAR.")
1167   (dimension, chars, final_char)
1168      Lisp_Object dimension, chars, final_char;
1169 {
1170   int charset;
1171
1172   CHECK_NUMBER (dimension, 0);
1173   CHECK_NUMBER (chars, 1);
1174   CHECK_NUMBER (final_char, 2);
1175
1176   if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1177     return Qnil;
1178   return CHARSET_SYMBOL (charset);
1179 }
1180
1181 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1182    generic character.  If GENERICP is zero, return nonzero iff C is a
1183    valid normal character.  Do not call this function directly,
1184    instead use macro CHAR_VALID_P.  */
1185 int
1186 char_valid_p (c, genericp)
1187      int c, genericp;
1188 {
1189   int charset, c1, c2;
1190
1191   if (c < 0)
1192     return 0;
1193   if (SINGLE_BYTE_CHAR_P (c))
1194     return 1;
1195   SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1196   if (charset == CHARSET_COMPOSITION)
1197     return ((c >= MIN_CHAR_COMPOSITION
1198              && c < MIN_CHAR_COMPOSITION + n_cmpchars)
1199             || (genericp && c == GENERIC_COMPOSITION_CHAR));
1200   if (genericp)
1201     {
1202       if (c1)
1203         {
1204           if (c2 <= 0) c2 = 0x20;
1205         }
1206       else
1207         {
1208           if (c2 <= 0) c1 = c2 = 0x20;
1209         }
1210     }
1211   return (CHARSET_DEFINED_P (charset)
1212           && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1213 }
1214
1215 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1216   "Return t if OBJECT is a valid normal character.\n\
1217 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1218 a valid generic character.")
1219   (object, genericp)
1220      Lisp_Object object, genericp;
1221 {
1222   if (! NATNUMP (object))
1223     return Qnil;
1224   return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1225 }
1226
1227 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1228        Sunibyte_char_to_multibyte, 1, 1, 0,
1229   "Convert the unibyte character CH to multibyte character.\n\
1230 The conversion is done based on `nonascii-translation-table' (which see)\n\
1231  or `nonascii-insert-offset' (which see).")
1232   (ch)
1233      Lisp_Object ch;
1234 {
1235   int c;
1236
1237   CHECK_NUMBER (ch, 0);
1238   c = XINT (ch);
1239   if (c < 0 || c >= 0400)
1240     error ("Invalid unibyte character: %d", c);
1241   c = unibyte_char_to_multibyte (c);
1242   if (c < 0)
1243     error ("Can't convert to multibyte character: %d", XINT (ch));
1244   return make_number (c);
1245 }
1246
1247 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1248        Smultibyte_char_to_unibyte, 1, 1, 0,
1249   "Convert the multibyte character CH to unibyte character.\n\
1250 The conversion is done based on `nonascii-translation-table' (which see)\n\
1251  or `nonascii-insert-offset' (which see).")
1252   (ch)
1253      Lisp_Object ch;
1254 {
1255   int c;
1256
1257   CHECK_NUMBER (ch, 0);
1258   c = XINT (ch);
1259   if (! CHAR_VALID_P (c, 0))
1260     error ("Invalid multibyte character: %d", c);
1261   c = multibyte_char_to_unibyte (c, Qnil);
1262   if (c < 0)
1263     error ("Can't convert to unibyte character: %d", XINT (ch));
1264   return make_number (c);
1265 }
1266
1267 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1268   "Return 1 regardless of the argument CHAR.\n\
1269 This is now an obsolete function.  We keep it just for backward compatibility.")
1270   (ch)
1271      Lisp_Object ch;
1272 {
1273   Lisp_Object val;
1274
1275   CHECK_NUMBER (ch, 0);
1276   return make_number (1);
1277 }
1278
1279 /* Return how many bytes C will occupy in a multibyte buffer.
1280    Don't call this function directly, instead use macro CHAR_BYTES.  */
1281 int
1282 char_bytes (c)
1283      int c;
1284 {
1285   int bytes;
1286
1287   if (SINGLE_BYTE_CHAR_P (c) || (c & ~GLYPH_MASK_CHAR))
1288     return 1;
1289
1290   if (COMPOSITE_CHAR_P (c))
1291     {
1292       unsigned int id = COMPOSITE_CHAR_ID (c);
1293
1294       bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
1295     }
1296   else
1297     {
1298       int charset = CHAR_CHARSET (c);
1299
1300       bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
1301     }
1302
1303   return bytes;
1304 }
1305
1306 /* Return the width of character of which multi-byte form starts with
1307    C.  The width is measured by how many columns occupied on the
1308    screen when displayed in the current buffer.  */
1309
1310 #define ONE_BYTE_CHAR_WIDTH(c)                                          \
1311   (c < 0x20                                                             \
1312    ? (c == '\t'                                                         \
1313       ? XFASTINT (current_buffer->tab_width)                            \
1314       : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2)))   \
1315    : (c < 0x7f                                                          \
1316       ? 1                                                               \
1317       : (c == 0x7F                                                      \
1318          ? (NILP (current_buffer->ctl_arrow) ? 4 : 2)                   \
1319          : ((! NILP (current_buffer->enable_multibyte_characters)       \
1320              && BASE_LEADING_CODE_P (c))                                \
1321             ? WIDTH_BY_CHAR_HEAD (c)                                    \
1322             : 4))))
1323
1324 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1325   "Return width of CHAR when displayed in the current buffer.\n\
1326 The width is measured by how many columns it occupies on the screen.")
1327   (ch)
1328        Lisp_Object ch;
1329 {
1330   Lisp_Object val, disp;
1331   int c;
1332   struct Lisp_Char_Table *dp = buffer_display_table ();
1333
1334   CHECK_NUMBER (ch, 0);
1335
1336   c = XINT (ch);
1337
1338   /* Get the way the display table would display it.  */
1339   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1340
1341   if (VECTORP (disp))
1342     XSETINT (val, XVECTOR (disp)->size);
1343   else if (SINGLE_BYTE_CHAR_P (c))
1344     XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1345   else if (COMPOSITE_CHAR_P (c))
1346     {
1347       int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
1348       XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 1));
1349     }
1350   else
1351     {
1352       int charset = CHAR_CHARSET (c);
1353
1354       XSETFASTINT (val, CHARSET_WIDTH (charset));
1355     }
1356   return val;
1357 }
1358
1359 /* Return width of string STR of length LEN when displayed in the
1360    current buffer.  The width is measured by how many columns it
1361    occupies on the screen.  */
1362
1363 int
1364 strwidth (str, len)
1365      unsigned char *str;
1366      int len;
1367 {
1368   unsigned char *endp = str + len;
1369   int width = 0;
1370   struct Lisp_Char_Table *dp = buffer_display_table ();
1371
1372   while (str < endp)
1373     {
1374       if (*str == LEADING_CODE_COMPOSITION)
1375         {
1376           int id = str_cmpchar_id (str, endp - str);
1377
1378           if (id < 0)
1379             {
1380               width += 4;
1381               str++;
1382             }
1383           else
1384             {
1385               width += cmpchar_table[id]->width;
1386               str += cmpchar_table[id]->len;
1387             }
1388         }
1389       else
1390         {
1391           Lisp_Object disp;
1392           int thislen;
1393           int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1394
1395           /* Get the way the display table would display it.  */
1396           if (dp)
1397             disp = DISP_CHAR_VECTOR (dp, c);
1398           else
1399             disp = Qnil;
1400
1401           if (VECTORP (disp))
1402             width += XVECTOR (disp)->size;
1403           else
1404             width += ONE_BYTE_CHAR_WIDTH (*str);
1405
1406           str += thislen;
1407         }
1408     }
1409   return width;
1410 }
1411
1412 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1413   "Return width of STRING when displayed in the current buffer.\n\
1414 Width is measured by how many columns it occupies on the screen.\n\
1415 When calculating width of a multibyte character in STRING,\n\
1416 only the base leading-code is considered; the validity of\n\
1417 the following bytes is not checked.")
1418   (str)
1419      Lisp_Object str;
1420 {
1421   Lisp_Object val;
1422
1423   CHECK_STRING (str, 0);
1424   XSETFASTINT (val, strwidth (XSTRING (str)->data,
1425                               STRING_BYTES (XSTRING (str))));
1426   return val;
1427 }
1428
1429 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1430   "Return the direction of CHAR.\n\
1431 The returned value is 0 for left-to-right and 1 for right-to-left.")
1432   (ch)
1433      Lisp_Object ch;
1434 {
1435   int charset;
1436
1437   CHECK_NUMBER (ch, 0);
1438   charset = CHAR_CHARSET (XFASTINT (ch));
1439   if (!CHARSET_DEFINED_P (charset))
1440     invalid_character (XINT (ch));
1441   return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1442 }
1443
1444 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1445   "Return number of characters between BEG and END.")
1446   (beg, end)
1447      Lisp_Object beg, end;
1448 {
1449   int from, to;
1450
1451   CHECK_NUMBER_COERCE_MARKER (beg, 0);
1452   CHECK_NUMBER_COERCE_MARKER (end, 1);
1453
1454   from = min (XFASTINT (beg), XFASTINT (end));
1455   to = max (XFASTINT (beg), XFASTINT (end));
1456
1457   return make_number (to - from);
1458 }
1459
1460 /* Return the number of characters in the NBYTES bytes at PTR.
1461    This works by looking at the contents and checking for multibyte sequences.
1462    However, if the current buffer has enable-multibyte-characters = nil,
1463    we treat each byte as a character.  */
1464
1465 int
1466 chars_in_text (ptr, nbytes)
1467      unsigned char *ptr;
1468      int nbytes;
1469 {
1470   /* current_buffer is null at early stages of Emacs initialization.  */
1471   if (current_buffer == 0
1472       || NILP (current_buffer->enable_multibyte_characters))
1473     return nbytes;
1474
1475   return multibyte_chars_in_text (ptr, nbytes);
1476 }
1477
1478 /* Return the number of characters in the NBYTES bytes at PTR.
1479    This works by looking at the contents and checking for multibyte sequences.
1480    It ignores enable-multibyte-characters.  */
1481
1482 int
1483 multibyte_chars_in_text (ptr, nbytes)
1484      unsigned char *ptr;
1485      int nbytes;
1486 {
1487   unsigned char *endp;
1488   int chars, bytes;
1489
1490   endp = ptr + nbytes;
1491   chars = 0;
1492
1493   while (ptr < endp)
1494     {
1495       if (BASE_LEADING_CODE_P (*ptr))
1496         {
1497           PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1498           ptr += bytes;
1499           nbytes -= bytes;
1500         }
1501       else
1502         {
1503           ptr++;
1504           nbytes--;
1505         }
1506       chars++;
1507     }
1508
1509   return chars;
1510 }
1511
1512 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1513   "Concatenate all the argument characters and make the result a string.")
1514   (n, args)
1515      int n;
1516      Lisp_Object *args;
1517 {
1518   int i;
1519   unsigned char *buf
1520     = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
1521   unsigned char *p = buf;
1522   Lisp_Object val;
1523
1524   for (i = 0; i < n; i++)
1525     {
1526       int c, len;
1527       unsigned char *str;
1528
1529       if (!INTEGERP (args[i]))
1530         CHECK_NUMBER (args[i], 0);
1531       c = XINT (args[i]);
1532       len = CHAR_STRING (c, p, str);
1533       if (p != str)
1534         /* C is a composite character.  */
1535         bcopy (str, p, len);
1536       p += len;
1537     }
1538
1539   /* Here, we can't use make_string_from_bytes because of byte
1540      combining problem.  */
1541   val = make_string (buf, p - buf);
1542   return val;
1543 }
1544
1545 #endif /* emacs */
1546 \f
1547 /*** Composite characters staffs ***/
1548
1549 /* Each composite character is identified by CMPCHAR-ID which is
1550    assigned when Emacs needs the character code of the composite
1551    character (e.g. when displaying it on the screen).  See the
1552    document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1553    composite character is represented in Emacs.  */
1554
1555 /* If `static' is defined, it means that it is defined to null string. */
1556 #ifndef static
1557 /* The following function is copied from lread.c.  */
1558 static int
1559 hash_string (ptr, len)
1560      unsigned char *ptr;
1561      int len;
1562 {
1563   register unsigned char *p = ptr;
1564   register unsigned char *end = p + len;
1565   register unsigned char c;
1566   register int hash = 0;
1567
1568   while (p != end)
1569     {
1570       c = *p++;
1571       if (c >= 0140) c -= 40;
1572       hash = ((hash<<3) + (hash>>28) + c);
1573     }
1574   return hash & 07777777777;
1575 }
1576 #endif
1577
1578 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1579
1580 static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1581
1582 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1583    integer, where the 1st element is the size of the array, the 2nd
1584    element is how many elements are actually used in the array, and
1585    the remaining elements are CMPCHAR-IDs of composite characters of
1586    the same hash value.  */
1587 #define CMPCHAR_HASH_SIZE(table) table[0]
1588 #define CMPCHAR_HASH_USED(table) table[1]
1589 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1590
1591 /* Return CMPCHAR-ID of the composite character in STR of the length
1592    LEN.  If the composite character has not yet been registered,
1593    register it in `cmpchar_table' and assign new CMPCHAR-ID.  This
1594    is the sole function for assigning CMPCHAR-ID.  */
1595 int
1596 str_cmpchar_id (str, len)
1597      const unsigned char *str;
1598      int len;
1599 {
1600   int hash_idx, *hashp;
1601   unsigned char *buf;
1602   int embedded_rule;            /* 1 if composition rule is embedded.  */
1603   int chars;                    /* number of components.  */
1604   int i;
1605   struct cmpchar_info *cmpcharp;
1606
1607   /* The second byte 0xFF means COMPOSITION rule is embedded.  */
1608   embedded_rule = (str[1] == 0xFF);
1609
1610   /* At first, get the actual length of the composite character.  */
1611   {
1612     const unsigned char *p, *endp = str + 1, *lastp = str + len;
1613     int bytes;
1614
1615     while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
1616     if (endp - str < 5)
1617       /* Any composite char have at least 5-byte length.  */
1618       return -1;
1619
1620     chars = 0;
1621     p = str + 1;
1622     while (p < endp)
1623       {
1624         if (embedded_rule)
1625           {
1626             p++;
1627             if (p >= endp)
1628               return -1;
1629           }
1630         /* No need of checking if *P is 0xA0 because
1631            BYTES_BY_CHAR_HEAD (0x80) surely returns 2.  */
1632         p += BYTES_BY_CHAR_HEAD (*p - 0x20);
1633         chars++;
1634       }
1635     if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1636       /* Invalid components.  */
1637       return -1;
1638     len = p - str;
1639   }
1640   hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1641   hashp = cmpchar_hash_table[hash_idx];
1642
1643   /* Then, look into the hash table.  */
1644   if (hashp != NULL)
1645     /* Find the correct one among composite characters of the same
1646        hash value.  */
1647     for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1648       {
1649         cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1650         if (len == cmpcharp->len
1651             && ! bcmp (str, cmpcharp->data, len))
1652           return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1653       }
1654
1655   /* We have to register the composite character in cmpchar_table.  */
1656   if (n_cmpchars >= (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1657     /* No, we have no more room for a new composite character.  */
1658     return -1;
1659
1660   /* Make the entry in hash table.  */
1661   if (hashp == NULL)
1662     {
1663       /* Make a table for 8 composite characters initially.  */
1664       hashp = (cmpchar_hash_table[hash_idx]
1665                = (int *) xmalloc (sizeof (int) * (2 + 8)));
1666       CMPCHAR_HASH_SIZE (hashp) = 10;
1667       CMPCHAR_HASH_USED (hashp) = 2;
1668     }
1669   else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1670     {
1671       CMPCHAR_HASH_SIZE (hashp) += 8;
1672       hashp = (cmpchar_hash_table[hash_idx]
1673                = (int *) xrealloc (hashp,
1674                                    sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1675     }
1676   CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1677   CMPCHAR_HASH_USED (hashp)++;
1678
1679   /* Set information of the composite character in cmpchar_table.  */
1680   if (cmpchar_table_size == 0)
1681     {
1682       /* This is the first composite character to be registered.  */
1683       cmpchar_table_size = 256;
1684       cmpchar_table
1685         = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1686                                             * cmpchar_table_size);
1687     }
1688   else if (cmpchar_table_size <= n_cmpchars)
1689     {
1690       cmpchar_table_size += 256;
1691       cmpchar_table
1692         = (struct cmpchar_info **) xrealloc (cmpchar_table,
1693                                              sizeof (cmpchar_table[0])
1694                                              * cmpchar_table_size);
1695     }
1696
1697   cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1698
1699   cmpcharp->len = len;
1700   cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1701   bcopy (str, cmpcharp->data, len);
1702   cmpcharp->data[len] = 0;
1703   cmpcharp->glyph_len = chars;
1704   cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1705   if (embedded_rule)
1706     {
1707       cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1708       cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1709     }
1710   else
1711     {
1712       cmpcharp->cmp_rule = NULL;
1713       cmpcharp->col_offset = NULL;
1714     }
1715
1716   /* Setup GLYPH data and composition rules (if any) so as not to make
1717      them every time on displaying.  */
1718   {
1719     unsigned char *bufp;
1720     int width;
1721     float leftmost = 0.0, rightmost = 1.0;
1722
1723     if (embedded_rule)
1724       /* At first, col_offset[N] is set to relative to col_offset[0].  */
1725       cmpcharp->col_offset[0] = 0;
1726
1727     for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1728       {
1729         if (embedded_rule)
1730           cmpcharp->cmp_rule[i] = *bufp++;
1731
1732         if (*bufp == 0xA0)      /* This is an ASCII character.  */
1733           {
1734             cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1735             width = 1;
1736             bufp++;
1737           }
1738         else                    /* Multibyte character.  */
1739           {
1740             /* Make `bufp' point normal multi-byte form temporally.  */
1741             *bufp -= 0x20;
1742             cmpcharp->glyph[i]
1743               = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1744             width = WIDTH_BY_CHAR_HEAD (*bufp);
1745             *bufp += 0x20;
1746             bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1747           }
1748
1749         if (embedded_rule && i > 0)
1750           {
1751             /* Reference points (global_ref and new_ref) are
1752                encoded as below:
1753
1754                0--1--2 -- ascent
1755                |     |
1756                |     |
1757                |  4 -+--- center
1758             -- 3     5 -- baseline
1759                |     |
1760                6--7--8 -- descent
1761
1762                Now, we calculate the column offset of the new glyph
1763                from the left edge of the first glyph.  This can avoid
1764                the same calculation everytime displaying this
1765                composite character.  */
1766
1767             /* Reference points of global glyph and new glyph.  */
1768             int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1769             int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1770             /* Column offset relative to the first glyph.  */
1771             float left = (leftmost
1772                           + (global_ref % 3) * (rightmost - leftmost) / 2.0
1773                           - (new_ref % 3) * width / 2.0);
1774
1775             cmpcharp->col_offset[i] = left;
1776             if (left < leftmost)
1777               leftmost = left;
1778             if (left + width > rightmost)
1779               rightmost = left + width;
1780           }
1781         else
1782           {
1783             if (width > rightmost)
1784               rightmost = width;
1785           }
1786       }
1787     if (embedded_rule)
1788       {
1789         /* Now col_offset[N] are relative to the left edge of the
1790            first component.  Make them relative to the left edge of
1791            overall glyph.  */
1792         for (i = 0; i < chars; i++)
1793           cmpcharp->col_offset[i] -= leftmost;
1794         /* Make rightmost holds width of overall glyph.  */
1795         rightmost -= leftmost;
1796       }
1797
1798     cmpcharp->width = rightmost;
1799     if (cmpcharp->width < rightmost)
1800       /* To get a ceiling integer value.  */
1801       cmpcharp->width++;
1802   }
1803
1804   cmpchar_table[n_cmpchars] = cmpcharp;
1805
1806   return n_cmpchars++;
1807 }
1808
1809 /* Return the Nth element of the composite character C.  If NOERROR is
1810    nonzero, return 0 on error condition (C is an invalid composite
1811    charcter, or N is out of range). */
1812 int
1813 cmpchar_component (c, n, noerror)
1814      int c, n, noerror;
1815 {
1816   int id = COMPOSITE_CHAR_ID (c);
1817
1818   if (id < 0 || id >= n_cmpchars)
1819     {
1820       /* C is not a valid composite character.  */
1821       if (noerror) return 0;
1822       error ("Invalid composite character: %d", c)  ;
1823     }
1824   if (n >= cmpchar_table[id]->glyph_len)
1825     {
1826       /* No such component.  */
1827       if (noerror) return 0;
1828       args_out_of_range (make_number (c), make_number (n));
1829     }
1830   /* No face data is stored in glyph code.  */
1831   return ((int) (cmpchar_table[id]->glyph[n]));
1832 }
1833
1834 DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1835   "T if CHAR is a composite character.")
1836   (ch)
1837      Lisp_Object ch;
1838 {
1839   CHECK_NUMBER (ch, 0);
1840   return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1841 }
1842
1843 DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1844        2, 2, 0,
1845   "Return the Nth component character of composite character CHARACTER.")
1846   (character, n)
1847      Lisp_Object character, n;
1848 {
1849   int id;
1850
1851   CHECK_NUMBER (character, 0);
1852   CHECK_NUMBER (n, 1);
1853
1854   return (make_number (cmpchar_component (XINT (character), XINT (n), 0)));
1855 }
1856
1857 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1858        2, 2, 0,
1859   "Return the Nth composition rule of composite character CHARACTER.\n\
1860 The returned rule is for composing the Nth component\n\
1861 on the (N-1)th component.\n\
1862 If CHARACTER should be composed relatively or N is 0, return 255.")
1863   (character, n)
1864      Lisp_Object character, n;
1865 {
1866   int id;
1867
1868   CHECK_NUMBER (character, 0);
1869   CHECK_NUMBER (n, 1);
1870
1871   id = COMPOSITE_CHAR_ID (XINT (character));
1872   if (id < 0 || id >= n_cmpchars)
1873     error ("Invalid composite character: %d", XINT (character));
1874   if (XINT (n) < 0 || XINT (n) >= cmpchar_table[id]->glyph_len)
1875     args_out_of_range (character, n);
1876
1877   return make_number (cmpchar_table[id]->cmp_rule
1878                       ? cmpchar_table[id]->cmp_rule[XINT (n)]
1879                       : 255);
1880 }
1881
1882 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1883        Scmpchar_cmp_rule_p, 1, 1, 0,
1884   "Return non-nil if composite character CHARACTER contains a embedded rule.")
1885   (character)
1886      Lisp_Object character;
1887 {
1888   int id;
1889
1890   CHECK_NUMBER (character, 0);
1891   id = COMPOSITE_CHAR_ID (XINT (character));
1892   if (id < 0 || id >= n_cmpchars)
1893     error ("Invalid composite character: %d", XINT (character));
1894
1895   return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1896 }
1897
1898 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1899        Scmpchar_cmp_count, 1, 1, 0,
1900   "Return number of compoents of composite character CHARACTER.")
1901   (character)
1902      Lisp_Object character;
1903 {
1904   int id;
1905
1906   CHECK_NUMBER (character, 0);
1907   id = COMPOSITE_CHAR_ID (XINT (character));
1908   if (id < 0 || id >= n_cmpchars)
1909     error ("Invalid composite character: %d", XINT (character));
1910
1911   return (make_number (cmpchar_table[id]->glyph_len));
1912 }
1913
1914 DEFUN ("compose-string", Fcompose_string, Scompose_string,
1915        1, 1, 0,
1916   "Return one char string composed from all characters in STRING.")
1917   (str)
1918      Lisp_Object str;
1919 {
1920   unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1921   int len, i;
1922
1923   CHECK_STRING (str, 0);
1924
1925   buf[0] = LEADING_CODE_COMPOSITION;
1926   p = XSTRING (str)->data;
1927   pend = p + STRING_BYTES (XSTRING (str));
1928   i = 1;
1929   while (p < pend)
1930     {
1931       if (*p < 0x20) /* control code */
1932         error ("Invalid component character: %d", *p);
1933       else if (*p < 0x80)       /* ASCII */
1934         {
1935           if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1936             error ("Too long string to be composed: %s", XSTRING (str)->data);
1937           /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1938              code itself.  */
1939           buf[i++] = 0xA0;
1940           buf[i++] = *p++ + 0x80;
1941         }
1942       else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1943         {
1944           /* Already composed.  Eliminate the heading
1945              LEADING_CODE_COMPOSITION, keep the remaining bytes
1946              unchanged.  */
1947           p++;
1948           if (*p == 255)
1949             error ("Can't compose a rule-based composition character");
1950           ptemp = p;
1951           while (! CHAR_HEAD_P (*p)) p++;
1952           if (str_cmpchar_id (ptemp - 1, p - ptemp + 1) < 0)
1953             error ("Can't compose an invalid composition character");
1954           if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1955             error ("Too long string to be composed: %s", XSTRING (str)->data);
1956           bcopy (ptemp, buf + i, p - ptemp);
1957           i += p - ptemp;
1958         }
1959       else                      /* multibyte char */
1960         {
1961           /* Add 0x20 to the base leading-code, keep the remaining
1962              bytes unchanged.  */
1963           int c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
1964
1965           if (len <= 1 || ! CHAR_VALID_P (c, 0))
1966             error ("Can't compose an invalid character");
1967           if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1968             error ("Too long string to be composed: %s", XSTRING (str)->data);
1969           bcopy (p, buf + i, len);
1970           buf[i] += 0x20;
1971           p += len, i += len;
1972         }
1973     }
1974
1975   if (i < 5)
1976     /* STR contains only one character, which can't be composed.  */
1977     error ("Too short string to be composed: %s", XSTRING (str)->data);
1978
1979   return make_string_from_bytes (buf, 1, i);
1980 }
1981
1982 \f
1983 int
1984 charset_id_internal (charset_name)
1985      char *charset_name;
1986 {
1987   Lisp_Object val;
1988
1989   val= Fget (intern (charset_name), Qcharset);
1990   if (!VECTORP (val))
1991     error ("Charset %s is not defined", charset_name);
1992
1993   return (XINT (XVECTOR (val)->contents[0]));
1994 }
1995
1996 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1997        Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1998    ()
1999 {
2000   charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
2001   charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
2002   charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
2003   charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
2004   charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
2005   charset_big5_1 = charset_id_internal ("chinese-big5-1");
2006   charset_big5_2 = charset_id_internal ("chinese-big5-2");
2007   return Qnil;
2008 }
2009
2010 void
2011 init_charset_once ()
2012 {
2013   int i, j, k;
2014
2015   staticpro (&Vcharset_table);
2016   staticpro (&Vcharset_symbol_table);
2017   staticpro (&Vgeneric_character_list);
2018
2019   /* This has to be done here, before we call Fmake_char_table.  */
2020   Qcharset_table = intern ("charset-table");
2021   staticpro (&Qcharset_table);
2022
2023   /* Intern this now in case it isn't already done.
2024      Setting this variable twice is harmless.
2025      But don't staticpro it here--that is done in alloc.c.  */
2026   Qchar_table_extra_slots = intern ("char-table-extra-slots");
2027
2028   /* Now we are ready to set up this property, so we can
2029      create the charset table.  */
2030   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
2031   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
2032
2033   Qunknown = intern ("unknown");
2034   staticpro (&Qunknown);
2035   Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
2036                                         Qunknown);
2037
2038   /* Setup tables.  */
2039   for (i = 0; i < 2; i++)
2040     for (j = 0; j < 2; j++)
2041       for (k = 0; k < 128; k++)
2042         iso_charset_table [i][j][k] = -1;
2043
2044   bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
2045   cmpchar_table_size = n_cmpchars = 0;
2046
2047   for (i = 0; i < 256; i++)
2048     BYTES_BY_CHAR_HEAD (i) = 1;
2049   for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
2050        i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
2051     BYTES_BY_CHAR_HEAD (i) = 2;
2052   for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
2053        i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
2054     BYTES_BY_CHAR_HEAD (i) = 3;
2055   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
2056   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
2057   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
2058   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
2059   /* The followings don't reflect the actual bytes, but just to tell
2060      that it is a start of a multibyte character.  */
2061   BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
2062   BYTES_BY_CHAR_HEAD (0x9E) = 2;
2063   BYTES_BY_CHAR_HEAD (0x9F) = 2;
2064
2065   for (i = 0; i < 128; i++)
2066     WIDTH_BY_CHAR_HEAD (i) = 1;
2067   for (; i < 256; i++)
2068     WIDTH_BY_CHAR_HEAD (i) = 4;
2069   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
2070   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
2071   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
2072   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
2073
2074   {
2075     Lisp_Object val;
2076
2077     val = Qnil;
2078     for (i = 0x81; i < 0x90; i++)
2079       val = Fcons (make_number ((i - 0x70) << 7), val);
2080     for (; i < 0x9A; i++)
2081       val = Fcons (make_number ((i - 0x8F) << 14), val);
2082     for (i = 0xA0; i < 0xF0; i++)
2083       val = Fcons (make_number ((i - 0x70) << 7), val);
2084     for (; i < 0xFF; i++)
2085       val = Fcons (make_number ((i - 0xE0) << 14), val);
2086     val = Fcons (make_number (GENERIC_COMPOSITION_CHAR), val);
2087     Vgeneric_character_list = Fnreverse (val);
2088   }
2089
2090   nonascii_insert_offset = 0;
2091   Vnonascii_translation_table = Qnil;
2092 }
2093
2094 #ifdef emacs
2095
2096 void
2097 syms_of_charset ()
2098 {
2099   Qascii = intern ("ascii");
2100   staticpro (&Qascii);
2101
2102   Qcharset = intern ("charset");
2103   staticpro (&Qcharset);
2104
2105   /* Define ASCII charset now.  */
2106   update_charset_table (make_number (CHARSET_ASCII),
2107                         make_number (1), make_number (94),
2108                         make_number (1),
2109                         make_number (0),
2110                         make_number ('B'),
2111                         make_number (0),
2112                         build_string ("ASCII"),
2113                         build_string ("ASCII"),
2114                         build_string ("ASCII (ISO646 IRV)"));
2115   CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
2116   Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
2117
2118   Qcomposition = intern ("composition");
2119   staticpro (&Qcomposition);
2120   CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
2121
2122   Qauto_fill_chars = intern ("auto-fill-chars");
2123   staticpro (&Qauto_fill_chars);
2124   Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
2125
2126   defsubr (&Sdefine_charset);
2127   defsubr (&Sgeneric_character_list);
2128   defsubr (&Sget_unused_iso_final_char);
2129   defsubr (&Sdeclare_equiv_charset);
2130   defsubr (&Sfind_charset_region);
2131   defsubr (&Sfind_charset_string);
2132   defsubr (&Smake_char_internal);
2133   defsubr (&Ssplit_char);
2134   defsubr (&Schar_charset);
2135   defsubr (&Scharset_after);
2136   defsubr (&Siso_charset);
2137   defsubr (&Schar_valid_p);
2138   defsubr (&Sunibyte_char_to_multibyte);
2139   defsubr (&Smultibyte_char_to_unibyte);
2140   defsubr (&Schar_bytes);
2141   defsubr (&Schar_width);
2142   defsubr (&Sstring_width);
2143   defsubr (&Schar_direction);
2144   defsubr (&Schars_in_region);
2145   defsubr (&Sstring);
2146   defsubr (&Scmpcharp);
2147   defsubr (&Scmpchar_component);
2148   defsubr (&Scmpchar_cmp_rule);
2149   defsubr (&Scmpchar_cmp_rule_p);
2150   defsubr (&Scmpchar_cmp_count);
2151   defsubr (&Scompose_string);
2152   defsubr (&Ssetup_special_charsets);
2153
2154   DEFVAR_LISP ("charset-list", &Vcharset_list,
2155     "List of charsets ever defined.");
2156   Vcharset_list = Fcons (Qascii, Qnil);
2157
2158   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
2159     "Vector of cons cell of a symbol and translation table ever defined.\n\
2160 An ID of a translation table is an index of this vector.");
2161   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
2162
2163   DEFVAR_INT ("leading-code-composition", &leading_code_composition,
2164     "Leading-code of composite characters.");
2165   leading_code_composition = LEADING_CODE_COMPOSITION;
2166
2167   DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
2168     "Leading-code of private TYPE9N charset of column-width 1.");
2169   leading_code_private_11 = LEADING_CODE_PRIVATE_11;
2170
2171   DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
2172     "Leading-code of private TYPE9N charset of column-width 2.");
2173   leading_code_private_12 = LEADING_CODE_PRIVATE_12;
2174
2175   DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
2176     "Leading-code of private TYPE9Nx9N charset of column-width 1.");
2177   leading_code_private_21 = LEADING_CODE_PRIVATE_21;
2178
2179   DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
2180     "Leading-code of private TYPE9Nx9N charset of column-width 2.");
2181   leading_code_private_22 = LEADING_CODE_PRIVATE_22;
2182
2183   DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
2184     "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
2185 This is used for converting unibyte text to multibyte,\n\
2186 and for inserting character codes specified by number.\n\n\
2187 This serves to convert a Latin-1 or similar 8-bit character code\n\
2188 to the corresponding Emacs multibyte character code.\n\
2189 Typically the value should be (- (make-char CHARSET 0) 128),\n\
2190 for your choice of character set.\n\
2191 If `nonascii-translation-table' is non-nil, it overrides this variable.");
2192   nonascii_insert_offset = 0;
2193
2194   DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
2195     "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
2196 This is used for converting unibyte text to multibyte,\n\
2197 and for inserting character codes specified by number.\n\n\
2198 Conversion is performed only when multibyte characters are enabled,\n\
2199 and it serves to convert a Latin-1 or similar 8-bit character code\n\
2200 to the corresponding Emacs character code.\n\n\
2201 If this is nil, `nonascii-insert-offset' is used instead.\n\
2202 See also the docstring of `make-translation-table'.");
2203   Vnonascii_translation_table = Qnil;
2204
2205   DEFVAR_INT ("min-composite-char", &min_composite_char,
2206     "Minimum character code of a composite character.");
2207   min_composite_char = MIN_CHAR_COMPOSITION;
2208
2209   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
2210     "A char-table for characters which invoke auto-filling.\n\
2211 Such characters has value t in this table.");
2212   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
2213   CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
2214   CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
2215 }
2216
2217 #endif /* emacs */