code.delx.au - gnu-emacs/blob - src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /* At first, see the document in `character.h' to understand the code
  27    in this file.  */
  28
  29 #ifdef emacs
  30 #include <config.h>
  31 #endif
  32
  33 #include <stdio.h>
  34
  35 #ifdef emacs
  36
  37 #include <sys/types.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of translation table ever defined.
  54    ID of a translation table is used to index this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
  60 Lisp_Object Qauto_fill_chars;
  61
  62 Lisp_Object Vchar_unify_table;
  63
  64 /* A char-table.  An element is non-nil iff the corresponding
  65    character has a printable glyph.  */
  66 Lisp_Object Vprintable_chars;
  67
  68 /* A char-table.  An elemnent is a column-width of the corresponding
  69    character.  */
  70 Lisp_Object Vchar_width_table;
  71
  72 /* A char-table.  An element is a symbol indicating the direction
  73    property of corresponding character.  */
  74 Lisp_Object Vchar_direction_table;
  75
  76 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  77 unsigned char *_fetch_multibyte_char_p;
  78 int _fetch_multibyte_char_len;
  79
  80 /* Char table of scripts.  */
  81 Lisp_Object Vchar_script_table;
  82
  83 static Lisp_Object Qchar_script_table;
  84
  85
  86 \f
  87
  88 int
  89 char_string_with_unification (c, p)
  90      int c;
  91      unsigned char *p;
  92 {
  93   int bytes;
  94
  95   MAYBE_UNIFY_CHAR (c);
  96
  97   if (c <= MAX_3_BYTE_CHAR || c > MAX_5_BYTE_CHAR)
  98     {
  99       bytes = CHAR_STRING (c, p);
 100     }
 101   else if (c <= MAX_4_BYTE_CHAR)
 102     {
 103       p[0] = (0xF0 | (c >> 18));
 104       p[1] = (0x80 | ((c >> 12) & 0x3F));
 105       p[2] = (0x80 | ((c >> 6) & 0x3F));
 106       p[3] = (0x80 | (c & 0x3F));
 107       bytes = 4;
 108     }
 109   else
 110     {
 111       p[0] = 0xF8;
 112       p[1] = (0x80 | ((c >> 18) & 0x0F));
 113       p[2] = (0x80 | ((c >> 12) & 0x3F));
 114       p[3] = (0x80 | ((c >> 6) & 0x3F));
 115       p[4] = (0x80 | (c & 0x3F));
 116       bytes = 5;
 117     }
 118
 119   return bytes;
 120 }
 121
 122
 123 int
 124 string_char_with_unification (p, advanced, len)
 125      const unsigned char *p;
 126      const unsigned char **advanced;
 127      int *len;
 128 {
 129   int c;
 130   const unsigned char *saved_p = p;
 131
 132   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 133     {
 134       c = STRING_CHAR_ADVANCE (p);
 135     }
 136   else if (! (*p & 0x08))
 137     {
 138       c = ((((p)[0] & 0xF) << 18)
 139            | (((p)[1] & 0x3F) << 12)
 140            | (((p)[2] & 0x3F) << 6)
 141            | ((p)[3] & 0x3F));
 142       p += 4;
 143     }
 144   else
 145     {
 146       c = ((((p)[1] & 0x3F) << 18)
 147            | (((p)[2] & 0x3F) << 12)
 148            | (((p)[3] & 0x3F) << 6)
 149            | ((p)[4] & 0x3F));
 150       p += 5;
 151     }
 152
 153   MAYBE_UNIFY_CHAR (c);
 154
 155   if (len)
 156     *len = p - saved_p;
 157   if (advanced)
 158     *advanced = p;
 159   return c;
 160 }
 161
 162
 163 /* Translate character C by translation table TABLE.  If C is
 164    negative, translate a character specified by CHARSET and CODE.  If
 165    no translation is found in TABLE, return the untranslated
 166    character.  */
 167
 168 int
 169 translate_char (table, c)
 170      Lisp_Object table;
 171      int c;
 172 {
 173   Lisp_Object ch;
 174
 175   if (! CHAR_TABLE_P (table))
 176     return c;
 177   ch = CHAR_TABLE_REF (table, c);
 178   if (! CHARACTERP (ch))
 179     return c;
 180   return XINT (ch);
 181 }
 182
 183 /* Convert the unibyte character C to the corresponding multibyte
 184    character based on the current value of charset_unibyte.  If C
 185    can't be converted, return C.  */
 186
 187 int
 188 unibyte_char_to_multibyte (c)
 189      int c;
 190 {
 191   struct charset *charset = CHARSET_FROM_ID (charset_unibyte);
 192   int c1 = DECODE_CHAR (charset, c);
 193
 194   return ((c1 >= 0) ? c1 : c);
 195 }
 196
 197
 198 /* Convert the multibyte character C to unibyte 8-bit character based
 199    on the current value of charset_unibyte.  If dimension of
 200    charset_unibyte is more than one, return (C & 0xFF).
 201
 202    The argument REV_TBL is now ignored.  It will be removed in the
 203    future.  */
 204
 205 int
 206 multibyte_char_to_unibyte (c, rev_tbl)
 207      int c;
 208      Lisp_Object rev_tbl;
 209 {
 210   struct charset *charset = CHARSET_FROM_ID (charset_unibyte);
 211   unsigned c1 = ENCODE_CHAR (charset, c);
 212
 213   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 214 }
 215
 216
 217 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 218        doc: /* Return non-nil if OBJECT is a character.  */)
 219      (object, ignore)
 220      Lisp_Object object, ignore;
 221 {
 222   return (CHARACTERP (object) ? Qt : Qnil);
 223 }
 224
 225 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 226        doc: /* Return the character of the maximum code.  */)
 227      ()
 228 {
 229   return make_number (MAX_CHAR);
 230 }
 231
 232 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 233        Sunibyte_char_to_multibyte, 1, 1, 0,
 234        doc: /* Convert the unibyte character CH to multibyte character.
 235 The multibyte character is a result of decoding CH by
 236 the current primary charset (value of `charset-primary').  */)
 237      (ch)
 238      Lisp_Object ch;
 239 {
 240   int c;
 241   struct charset *charset;
 242
 243   CHECK_CHARACTER (ch);
 244   c = XFASTINT (ch);
 245   if (c >= 0400)
 246     error ("Invalid unibyte character: %d", c);
 247   charset = CHARSET_FROM_ID (charset_unibyte);
 248   c = DECODE_CHAR (charset, c);
 249   if (c < 0)
 250     c = BYTE8_TO_CHAR (XFASTINT (ch));
 251   return make_number (c);
 252 }
 253
 254 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 255        Smultibyte_char_to_unibyte, 1, 1, 0,
 256        doc: /* Convert the multibyte character CH to unibyte character.\n\
 257 The unibyte character is a result of encoding CH by
 258 the current primary charset (value of `charset-primary').  */)
 259      (ch)
 260      Lisp_Object ch;
 261 {
 262   int c;
 263
 264   CHECK_CHARACTER (ch);
 265   c = XFASTINT (ch);
 266   c = CHAR_TO_BYTE8 (c);
 267   return make_number (c);
 268 }
 269
 270 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 271        doc: /* Return 1 regardless of the argument CHAR.
 272 This is now an obsolete function.  We keep it just for backward compatibility.   */)
 273      (ch)
 274      Lisp_Object ch;
 275 {
 276   CHECK_CHARACTER (ch);
 277   return make_number (1);
 278 }
 279
 280 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 281        doc: /* Return width of CHAR when displayed in the current buffer.
 282 The width is measured by how many columns it occupies on the screen.
 283 Tab is taken to occupy `tab-width' columns.  */)
 284      (ch)
 285        Lisp_Object ch;
 286 {
 287   Lisp_Object disp;
 288   int c, width;
 289   struct Lisp_Char_Table *dp = buffer_display_table ();
 290
 291   CHECK_CHARACTER (ch);
 292   c = XINT (ch);
 293
 294   /* Get the way the display table would display it.  */
 295   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 296
 297   if (VECTORP (disp))
 298     width = ASIZE (disp);
 299   else
 300     width = CHAR_WIDTH (c);
 301
 302   return make_number (width);
 303 }
 304
 305 /* Return width of string STR of length LEN when displayed in the
 306    current buffer.  The width is measured by how many columns it
 307    occupies on the screen.  If PRECISION > 0, return the width of
 308    longest substring that doesn't exceed PRECISION, and set number of
 309    characters and bytes of the substring in *NCHARS and *NBYTES
 310    respectively.  */
 311
 312 int
 313 c_string_width (str, len, precision, nchars, nbytes)
 314      unsigned char *str;
 315      int precision, *nchars, *nbytes;
 316 {
 317   int i = 0, i_byte = 0;
 318   int width = 0;
 319   struct Lisp_Char_Table *dp = buffer_display_table ();
 320
 321   while (i_byte < len)
 322     {
 323       int bytes, thiswidth;
 324       Lisp_Object val;
 325       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 326
 327       if (dp)
 328         {
 329           val = DISP_CHAR_VECTOR (dp, c);
 330           if (VECTORP (val))
 331             thiswidth = XVECTOR (val)->size;
 332           else
 333             thiswidth = CHAR_WIDTH (c);
 334         }
 335       else
 336         {
 337           thiswidth = CHAR_WIDTH (c);
 338         }
 339
 340       if (precision > 0
 341           && (width + thiswidth > precision))
 342         {
 343           *nchars = i;
 344           *nbytes = i_byte;
 345           return width;
 346         }
 347       i++;
 348       i_byte += bytes;
 349       width += thiswidth;
 350   }
 351
 352   if (precision > 0)
 353     {
 354       *nchars = i;
 355       *nbytes = i_byte;
 356     }
 357
 358   return width;
 359 }
 360
 361 /* Return width of string STR of length LEN when displayed in the
 362    current buffer.  The width is measured by how many columns it
 363    occupies on the screen.  */
 364
 365 int
 366 strwidth (str, len)
 367      unsigned char *str;
 368      int len;
 369 {
 370   return c_string_width (str, len, -1, NULL, NULL);
 371 }
 372
 373 /* Return width of Lisp string STRING when displayed in the current
 374    buffer.  The width is measured by how many columns it occupies on
 375    the screen while paying attention to compositions.  If PRECISION >
 376    0, return the width of longest substring that doesn't exceed
 377    PRECISION, and set number of characters and bytes of the substring
 378    in *NCHARS and *NBYTES respectively.  */
 379
 380 int
 381 lisp_string_width (string, precision, nchars, nbytes)
 382      Lisp_Object string;
 383      int precision, *nchars, *nbytes;
 384 {
 385   int len = XSTRING (string)->size;
 386   unsigned char *str = XSTRING (string)->data;
 387   int i = 0, i_byte = 0;
 388   int width = 0;
 389   struct Lisp_Char_Table *dp = buffer_display_table ();
 390
 391   while (i < len)
 392     {
 393       int chars, bytes, thiswidth;
 394       Lisp_Object val;
 395       int cmp_id;
 396       int ignore, end;
 397
 398       if (find_composition (i, -1, &ignore, &end, &val, string)
 399           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 400               >= 0))
 401         {
 402           thiswidth = composition_table[cmp_id]->width;
 403           chars = end - i;
 404           bytes = string_char_to_byte (string, end) - i_byte;
 405         }
 406       else if (dp)
 407         {
 408           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 409
 410           chars = 1;
 411           val = DISP_CHAR_VECTOR (dp, c);
 412           if (VECTORP (val))
 413             thiswidth = XVECTOR (val)->size;
 414           else
 415             thiswidth = CHAR_WIDTH (c);
 416         }
 417       else
 418         {
 419           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 420
 421           chars = 1;
 422           thiswidth = CHAR_WIDTH (c);
 423         }
 424
 425       if (precision > 0
 426           && (width + thiswidth > precision))
 427         {
 428           *nchars = i;
 429           *nbytes = i_byte;
 430           return width;
 431         }
 432       i += chars;
 433       i_byte += bytes;
 434       width += thiswidth;
 435   }
 436
 437   if (precision > 0)
 438     {
 439       *nchars = i;
 440       *nbytes = i_byte;
 441     }
 442
 443   return width;
 444 }
 445
 446 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 447        doc: /* Return width of STRING when displayed in the current buffer.
 448 Width is measured by how many columns it occupies on the screen.
 449 When calculating width of a multibyte character in STRING,
 450 only the base leading-code is considered; the validity of
 451 the following bytes is not checked.  Tabs in STRING are always
 452 taken to occupy `tab-width' columns.  */)
 453      (str)
 454      Lisp_Object str;
 455 {
 456   Lisp_Object val;
 457
 458   CHECK_STRING (str);
 459   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 460   return val;
 461 }
 462
 463 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 464        doc: /* Return the direction of CHAR.
 465 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
 466      (ch)
 467      Lisp_Object ch;
 468 {
 469   int c;
 470
 471   CHECK_CHARACTER (ch);
 472   c = XINT (ch);
 473   return CHAR_TABLE_REF (Vchar_direction_table, c);
 474 }
 475
 476 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
 477        doc: /* Return number of characters between BEG and END.
 478 This is now an obsolete function.  We keep it just for backward compatibility.  */)
 479      (beg, end)
 480      Lisp_Object beg, end;
 481 {
 482   int from, to;
 483
 484   CHECK_NUMBER_COERCE_MARKER (beg);
 485   CHECK_NUMBER_COERCE_MARKER (end);
 486
 487   from = min (XFASTINT (beg), XFASTINT (end));
 488   to = max (XFASTINT (beg), XFASTINT (end));
 489
 490   return make_number (to - from);
 491 }
 492
 493 /* Return the number of characters in the NBYTES bytes at PTR.
 494    This works by looking at the contents and checking for multibyte
 495    sequences while assuming that there's no invalid sequence.
 496    However, if the current buffer has enable-multibyte-characters =
 497    nil, we treat each byte as a character.  */
 498
 499 int
 500 chars_in_text (ptr, nbytes)
 501      unsigned char *ptr;
 502      int nbytes;
 503 {
 504   /* current_buffer is null at early stages of Emacs initialization.  */
 505   if (current_buffer == 0
 506       || NILP (current_buffer->enable_multibyte_characters))
 507     return nbytes;
 508
 509   return multibyte_chars_in_text (ptr, nbytes);
 510 }
 511
 512 /* Return the number of characters in the NBYTES bytes at PTR.
 513    This works by looking at the contents and checking for multibyte
 514    sequences while assuming that there's no invalid sequence.  It
 515    ignores enable-multibyte-characters.  */
 516
 517 int
 518 multibyte_chars_in_text (ptr, nbytes)
 519      unsigned char *ptr;
 520      int nbytes;
 521 {
 522   unsigned char *endp = ptr + nbytes;
 523   int chars = 0;
 524
 525   while (ptr < endp)
 526     {
 527       int len = MULTIBYTE_LENGTH (ptr, endp);
 528
 529       if (len == 0)
 530         abort ();
 531       ptr += len;
 532       chars++;
 533     }
 534
 535   return chars;
 536 }
 537
 538 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 539    characters and bytes in it, and store them in *NCHARS and *NBYTES
 540    respectively.  On counting bytes, pay attention to that 8-bit
 541    characters not constructing a valid multibyte sequence are
 542    represented by 2-byte in a multibyte text.  */
 543
 544 void
 545 parse_str_as_multibyte (str, len, nchars, nbytes)
 546      unsigned char *str;
 547      int len, *nchars, *nbytes;
 548 {
 549   unsigned char *endp = str + len;
 550   int n, chars = 0, bytes = 0;
 551
 552   if (len >= MAX_MULTIBYTE_LENGTH)
 553     {
 554       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 555       while (str < adjusted_endp)
 556         {
 557           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 558             str += n, bytes += n;
 559           else
 560             str++, bytes += 2;
 561           chars++;
 562         }
 563     }
 564   while (str < endp)
 565     {
 566       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 567         str += n, bytes += n;
 568       else
 569         str++, bytes += 2;
 570       chars++;
 571     }
 572
 573   *nchars = chars;
 574   *nbytes = bytes;
 575   return;
 576 }
 577
 578 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 579    It actually converts only such 8-bit characters that don't contruct
 580    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 581    NCHARS is nonzero, set *NCHARS to the number of characters in the
 582    text.  It is assured that we can use LEN bytes at STR as a work
 583    area and that is enough.  Return the number of bytes of the
 584    resulting text.  */
 585
 586 int
 587 str_as_multibyte (str, len, nbytes, nchars)
 588      unsigned char *str;
 589      int len, nbytes, *nchars;
 590 {
 591   unsigned char *p = str, *endp = str + nbytes;
 592   unsigned char *to;
 593   int chars = 0;
 594   int n;
 595
 596   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 597     {
 598       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 599       while (p < adjusted_endp
 600              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 601         p += n, chars++;
 602     }
 603   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 604     p += n, chars++;
 605   if (nchars)
 606     *nchars = chars;
 607   if (p == endp)
 608     return nbytes;
 609
 610   to = p;
 611   nbytes = endp - p;
 612   endp = str + len;
 613   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 614   p = endp - nbytes;
 615
 616   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 617     {
 618       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 619       while (p < adjusted_endp)
 620         {
 621           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 622             {
 623               while (n--)
 624                 *to++ = *p++;
 625             }
 626           else
 627             {
 628               int c = *p++;
 629               c = BYTE8_TO_CHAR (c);
 630               to += CHAR_STRING (c, to);
 631             }
 632         }
 633       chars++;
 634     }
 635   while (p < endp)
 636     {
 637       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 638         {
 639           while (n--)
 640             *to++ = *p++;
 641         }
 642       else
 643         {
 644           int c = *p++;
 645           c = BYTE8_TO_CHAR (c);
 646           to += CHAR_STRING (c, to);
 647         }
 648       chars++;
 649     }
 650   if (nchars)
 651     *nchars = chars;
 652   return (to - str);
 653 }
 654
 655 /* Parse unibyte string at STR of LEN bytes, and return the number of
 656    bytes it may ocupy when converted to multibyte string by
 657    `str_to_multibyte'.  */
 658
 659 int
 660 parse_str_to_multibyte (str, len)
 661      unsigned char *str;
 662      int len;
 663 {
 664   unsigned char *endp = str + len;
 665   int bytes;
 666
 667   for (bytes = 0; str < endp; str++)
 668     bytes += (*str < 0x80) ? 1 : 2;
 669   return bytes;
 670 }
 671
 672
 673 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 674    that contains the same single-byte characters.  It actually
 675    converts all 8-bit characters to multibyte forms.  It is assured
 676    that we can use LEN bytes at STR as a work area and that is
 677    enough.  */
 678
 679 int
 680 str_to_multibyte (str, len, bytes)
 681      unsigned char *str;
 682      int len, bytes;
 683 {
 684   unsigned char *p = str, *endp = str + bytes;
 685   unsigned char *to;
 686
 687   while (p < endp && *p < 0x80) p++;
 688   if (p == endp)
 689     return bytes;
 690   to = p;
 691   bytes = endp - p;
 692   endp = str + len;
 693   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 694   p = endp - bytes;
 695   while (p < endp)
 696     {
 697       int c = *p++;
 698
 699       if (c >= 0x80)
 700         c = BYTE8_TO_CHAR (c);
 701       to += CHAR_STRING (c, to);
 702     }
 703   return (to - str);
 704 }
 705
 706 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 707    actually converts characters in the range 0x80..0xFF to
 708    unibyte.  */
 709
 710 int
 711 str_as_unibyte (str, bytes)
 712      unsigned char *str;
 713      int bytes;
 714 {
 715   const unsigned char *p = str, *endp = str + bytes;
 716   unsigned char *to;
 717   int c, len;
 718
 719   while (p < endp)
 720     {
 721       c = *p;
 722       len = BYTES_BY_CHAR_HEAD (c);
 723       if (CHAR_BYTE8_HEAD_P (c))
 724         break;
 725       p += len;
 726     }
 727   to = str + (p - str);
 728   while (p < endp)
 729     {
 730       c = *p;
 731       len = BYTES_BY_CHAR_HEAD (c);
 732       if (CHAR_BYTE8_HEAD_P (c))
 733         {
 734           c = STRING_CHAR_ADVANCE (p);
 735           *to++ = CHAR_TO_BYTE8 (c);
 736         }
 737       else
 738         {
 739           while (len--) *to++ = *p++;
 740         }
 741     }
 742   return (to - str);
 743 }
 744
 745 int
 746 string_count_byte8 (string)
 747      Lisp_Object string;
 748 {
 749   int multibyte = STRING_MULTIBYTE (string);
 750   int nbytes = STRING_BYTES (XSTRING (string));
 751   unsigned char *p = XSTRING (string)->data;
 752   unsigned char *pend = p + nbytes;
 753   int count = 0;
 754   int c, len;
 755
 756   if (multibyte)
 757     while (p < pend)
 758       {
 759         c = *p;
 760         len = BYTES_BY_CHAR_HEAD (c);
 761
 762         if (CHAR_BYTE8_HEAD_P (c))
 763           count++;
 764         p += len;
 765       }
 766   else
 767     while (p < pend)
 768       {
 769         if (*p++ >= 0x80)
 770           count++;
 771       }
 772   return count;
 773 }
 774
 775
 776 Lisp_Object
 777 string_escape_byte8 (string)
 778      Lisp_Object string;
 779 {
 780   int nchars = XSTRING (string)->size;
 781   int nbytes = STRING_BYTES (XSTRING (string));
 782   int multibyte = STRING_MULTIBYTE (string);
 783   int byte8_count;
 784   const unsigned char *src, *src_end;
 785   unsigned char *dst;
 786   Lisp_Object val;
 787   int c, len;
 788
 789   if (multibyte && nchars == nbytes)
 790     return string;
 791
 792   byte8_count = string_count_byte8 (string);
 793
 794   if (byte8_count == 0)
 795     return string;
 796
 797   if (multibyte)
 798     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 799     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 800                                         nbytes + byte8_count * 2);
 801   else
 802     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 803     val = make_uninit_string (nbytes + byte8_count * 3);
 804
 805   src = XSTRING (string)->data;
 806   src_end = src + nbytes;
 807   dst = XSTRING (val)->data;
 808   if (multibyte)
 809     while (src < src_end)
 810       {
 811         c = *src;
 812         len = BYTES_BY_CHAR_HEAD (c);
 813
 814         if (CHAR_BYTE8_HEAD_P (c))
 815           {
 816             c = STRING_CHAR_ADVANCE (src);
 817             c = CHAR_TO_BYTE8 (c);
 818             sprintf ((char *) dst, "\\%03o", c);
 819             dst += 4;
 820           }
 821         else
 822           while (len--) *dst++ = *src++;
 823       }
 824   else
 825     while (src < src_end)
 826       {
 827         c = *src++;
 828         if (c >= 0x80)
 829           {
 830             sprintf ((char *) dst, "\\%03o", c);
 831             dst += 4;
 832           }
 833         else
 834           *dst++ = c;
 835       }
 836   return val;
 837 }
 838
 839 \f
 840 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
 841        doc: /*
 842 Concatenate all the argument characters and make the result a string.
 843 usage: (string &rest CHARACTERS)  */)
 844      (n, args)
 845      int n;
 846      Lisp_Object *args;
 847 {
 848   int i;
 849   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 850   unsigned char *p = buf;
 851   int c;
 852
 853   for (i = 0; i < n; i++)
 854     {
 855       CHECK_CHARACTER (args[i]);
 856       c = XINT (args[i]);
 857       p += CHAR_STRING (c, p);
 858     }
 859
 860   return make_string_from_bytes ((char *) buf, n, p - buf);
 861 }
 862
 863 void
 864 init_character_once ()
 865 {
 866 }
 867
 868 #ifdef emacs
 869
 870 void
 871 syms_of_character ()
 872 {
 873   DEFSYM (Qcharacterp, "characterp");
 874   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 875
 876   staticpro (&Vchar_unify_table);
 877   Vchar_unify_table = Qnil;
 878
 879   defsubr (&Smax_char);
 880   defsubr (&Scharacterp);
 881   defsubr (&Sunibyte_char_to_multibyte);
 882   defsubr (&Smultibyte_char_to_unibyte);
 883   defsubr (&Schar_bytes);
 884   defsubr (&Schar_width);
 885   defsubr (&Sstring_width);
 886   defsubr (&Schar_direction);
 887   defsubr (&Schars_in_region);
 888   defsubr (&Sstring);
 889
 890   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 891                doc: /*
 892 Vector recording all translation tables ever defined.
 893 Each element is a pair (SYMBOL . TABLE) relating the table to the
 894 symbol naming it.  The ID of a translation table is an index into this vector.  */);
 895   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
 896
 897   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
 898                doc: /*
 899 A char-table for characters which invoke auto-filling.
 900 Such characters have value t in this table.  */);
 901   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
 902   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
 903   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
 904
 905   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
 906                doc: /*
 907 A char-table for width (columns) of each character.  */);
 908   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
 909   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
 910   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
 911                         make_number (4));
 912
 913   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
 914                doc: /* A char-table for direction of each character.  */);
 915   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
 916
 917   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
 918                doc: /* A char-table for each printable character.  */);
 919   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
 920
 921   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
 922                doc: /* Char table of script symbols.
 923 It has one extra slot whose value is a list of script symbols.  */);
 924
 925   /* Intern this now in case it isn't already done.
 926      Setting this variable twice is harmless.
 927      But don't staticpro it here--that is done in alloc.c.  */
 928   Qchar_table_extra_slots = intern ("char-table-extra-slots");
 929   DEFSYM (Qchar_script_table, "char-script-table");
 930   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
 931   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
 932 }
 933
 934 #endif /* emacs */