code.delx.au - gnu-emacs/blob - src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2005 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /* At first, see the document in `character.h' to understand the code
  27    in this file.  */
  28
  29 #ifdef emacs
  30 #include <config.h>
  31 #endif
  32
  33 #include <stdio.h>
  34
  35 #ifdef emacs
  36
  37 #include <sys/types.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of translation table ever defined.
  54    ID of a translation table is used to index this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
  60 Lisp_Object Qauto_fill_chars;
  61
  62 /* Char-table of information about which character to unify to which
  63    Unicode character.  */
  64 Lisp_Object Vchar_unify_table;
  65
  66 /* A char-table.  An element is non-nil iff the corresponding
  67    character has a printable glyph.  */
  68 Lisp_Object Vprintable_chars;
  69
  70 /* A char-table.  An elemnent is a column-width of the corresponding
  71    character.  */
  72 Lisp_Object Vchar_width_table;
  73
  74 /* A char-table.  An element is a symbol indicating the direction
  75    property of corresponding character.  */
  76 Lisp_Object Vchar_direction_table;
  77
  78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  79 unsigned char *_fetch_multibyte_char_p;
  80
  81 /* Char table of scripts.  */
  82 Lisp_Object Vchar_script_table;
  83
  84 static Lisp_Object Qchar_script_table;
  85
  86 /* Mapping table from unibyte chars to multibyte chars.  */
  87 int unibyte_to_multibyte_table[256];
  88
  89 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
  90    char.  */
  91 char unibyte_has_multibyte_table[256];
  92
  93 \f
  94
  95 /* Store multibyte form of character C at P.  If C has modifier bits,
  96    handle them appropriately.  */
  97
  98 int
  99 char_string (c, p)
 100      int c;
 101      unsigned char *p;
 102 {
 103   int bytes;
 104
 105   if (c & CHAR_MODIFIER_MASK)
 106     {
 107       /* As an non-ASCII character can't have modifier bits, we just
 108          ignore the bits.  */
 109       if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 110         {
 111           /* For Meta, Shift, and Control modifiers, we need special care.  */
 112           if (c & CHAR_META)
 113             {
 114               /* Move the meta bit to the right place for a string.  */
 115               c = (c & ~CHAR_META) | 0x80;
 116             }
 117           if (c & CHAR_SHIFT)
 118             {
 119               /* Shift modifier is valid only with [A-Za-z].  */
 120               if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 121                 c &= ~CHAR_SHIFT;
 122               else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 123                 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 124             }
 125           if (c & CHAR_CTL)
 126             {
 127               /* Simulate the code in lread.c.  */
 128               /* Allow `\C- ' and `\C-?'.  */
 129               if (c == (CHAR_CTL | ' '))
 130                 c = 0;
 131               else if (c == (CHAR_CTL | '?'))
 132                 c = 127;
 133               /* ASCII control chars are made from letters (both cases),
 134                  as well as the non-letters within 0100...0137.  */
 135               else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 136                 c &= (037 | (~0177 & ~CHAR_CTL));
 137               else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 138                 c &= (037 | (~0177 & ~CHAR_CTL));
 139             }
 140         }
 141
 142       /* If C still has any modifier bits, just ignore it.  */
 143       c &= ~CHAR_MODIFIER_MASK;
 144     }
 145
 146   MAYBE_UNIFY_CHAR (c);
 147
 148   if (c <= MAX_3_BYTE_CHAR)
 149     {
 150       bytes = CHAR_STRING (c, p);
 151     }
 152   else if (c <= MAX_4_BYTE_CHAR)
 153     {
 154       p[0] = (0xF0 | (c >> 18));
 155       p[1] = (0x80 | ((c >> 12) & 0x3F));
 156       p[2] = (0x80 | ((c >> 6) & 0x3F));
 157       p[3] = (0x80 | (c & 0x3F));
 158       bytes = 4;
 159     }
 160   else if (c <= MAX_5_BYTE_CHAR)
 161     {
 162       p[0] = 0xF8;
 163       p[1] = (0x80 | ((c >> 18) & 0x0F));
 164       p[2] = (0x80 | ((c >> 12) & 0x3F));
 165       p[3] = (0x80 | ((c >> 6) & 0x3F));
 166       p[4] = (0x80 | (c & 0x3F));
 167       bytes = 5;
 168     }
 169   else
 170     {
 171       c = CHAR_TO_BYTE8 (c);
 172       bytes = BYTE8_STRING (c, p);
 173     }
 174
 175   return bytes;
 176 }
 177
 178
 179 /* Return a character whose multibyte form is at P.  Set LEN is not
 180    NULL, it must be a pointer to integer.  In that case, set *LEN to
 181    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 182    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 183    the ending address (i.e. the starting address of the next
 184    character) of the multibyte form.  */
 185
 186 int
 187 string_char (p, advanced, len)
 188      const unsigned char *p;
 189      const unsigned char **advanced;
 190      int *len;
 191 {
 192   int c;
 193   const unsigned char *saved_p = p;
 194
 195   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 196     {
 197       c = STRING_CHAR_ADVANCE (p);
 198     }
 199   else if (! (*p & 0x08))
 200     {
 201       c = ((((p)[0] & 0xF) << 18)
 202            | (((p)[1] & 0x3F) << 12)
 203            | (((p)[2] & 0x3F) << 6)
 204            | ((p)[3] & 0x3F));
 205       p += 4;
 206     }
 207   else
 208     {
 209       c = ((((p)[1] & 0x3F) << 18)
 210            | (((p)[2] & 0x3F) << 12)
 211            | (((p)[3] & 0x3F) << 6)
 212            | ((p)[4] & 0x3F));
 213       p += 5;
 214     }
 215
 216   MAYBE_UNIFY_CHAR (c);
 217
 218   if (len)
 219     *len = p - saved_p;
 220   if (advanced)
 221     *advanced = p;
 222   return c;
 223 }
 224
 225
 226 /* Translate character C by translation table TABLE.  If C is
 227    negative, translate a character specified by CHARSET and CODE.  If
 228    no translation is found in TABLE, return the untranslated
 229    character.  If TABLE is a list, elements are char tables.  In this
 230    case, translace C by all tables.  */
 231
 232 int
 233 translate_char (table, c)
 234      Lisp_Object table;
 235      int c;
 236 {
 237   if (CHAR_TABLE_P (table))
 238     {
 239       Lisp_Object ch;
 240
 241       ch = CHAR_TABLE_REF (table, c);
 242       if (CHARACTERP (ch))
 243         c = XINT (ch);
 244     }
 245   else
 246     {
 247       for (; CONSP (table); table = XCDR (table))
 248         c = translate_char (XCAR (table), c);
 249     }
 250   return c;
 251 }
 252
 253 /* Convert the multibyte character C to unibyte 8-bit character based
 254    on the current value of charset_unibyte.  If dimension of
 255    charset_unibyte is more than one, return (C & 0xFF).
 256
 257    The argument REV_TBL is now ignored.  It will be removed in the
 258    future.  */
 259
 260 int
 261 multibyte_char_to_unibyte (c, rev_tbl)
 262      int c;
 263      Lisp_Object rev_tbl;
 264 {
 265   struct charset *charset;
 266   unsigned c1;
 267
 268   if (CHAR_BYTE8_P (c))
 269     return CHAR_TO_BYTE8 (c);
 270   charset = CHARSET_FROM_ID (charset_unibyte);
 271   c1 = ENCODE_CHAR (charset, c);
 272   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 273 }
 274
 275
 276 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 277        doc: /* Return non-nil if OBJECT is a character.  */)
 278      (object, ignore)
 279      Lisp_Object object, ignore;
 280 {
 281   return (CHARACTERP (object) ? Qt : Qnil);
 282 }
 283
 284 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 285        doc: /* Return the character of the maximum code.  */)
 286      ()
 287 {
 288   return make_number (MAX_CHAR);
 289 }
 290
 291 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 292        Sunibyte_char_to_multibyte, 1, 1, 0,
 293        doc: /* Convert the unibyte character CH to multibyte character.
 294 The multibyte character is a result of decoding CH by
 295 the current unibyte charset (see `unibyte-charset').  */)
 296      (ch)
 297      Lisp_Object ch;
 298 {
 299   int c;
 300   struct charset *charset;
 301
 302   CHECK_CHARACTER (ch);
 303   c = XFASTINT (ch);
 304   if (c >= 0400)
 305     error ("Invalid unibyte character: %d", c);
 306   charset = CHARSET_FROM_ID (charset_unibyte);
 307   c = DECODE_CHAR (charset, c);
 308   if (c < 0)
 309     c = BYTE8_TO_CHAR (XFASTINT (ch));
 310   return make_number (c);
 311 }
 312
 313 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 314        Smultibyte_char_to_unibyte, 1, 1, 0,
 315        doc: /* Convert the multibyte character CH to unibyte character.\n\
 316 The unibyte character is a result of encoding CH by
 317 the current primary charset (value of `charset-primary').  */)
 318      (ch)
 319      Lisp_Object ch;
 320 {
 321   int c;
 322
 323   CHECK_CHARACTER (ch);
 324   c = XFASTINT (ch);
 325   c = CHAR_TO_BYTE8 (c);
 326   return make_number (c);
 327 }
 328
 329 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 330        doc: /* Return 1 regardless of the argument CHAR.
 331 This is now an obsolete function.  We keep it just for backward compatibility.   */)
 332      (ch)
 333      Lisp_Object ch;
 334 {
 335   CHECK_CHARACTER (ch);
 336   return make_number (1);
 337 }
 338
 339 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 340        doc: /* Return width of CHAR when displayed in the current buffer.
 341 The width is measured by how many columns it occupies on the screen.
 342 Tab is taken to occupy `tab-width' columns.  */)
 343      (ch)
 344        Lisp_Object ch;
 345 {
 346   Lisp_Object disp;
 347   int c, width;
 348   struct Lisp_Char_Table *dp = buffer_display_table ();
 349
 350   CHECK_CHARACTER (ch);
 351   c = XINT (ch);
 352
 353   /* Get the way the display table would display it.  */
 354   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 355
 356   if (VECTORP (disp))
 357     width = ASIZE (disp);
 358   else
 359     width = CHAR_WIDTH (c);
 360
 361   return make_number (width);
 362 }
 363
 364 /* Return width of string STR of length LEN when displayed in the
 365    current buffer.  The width is measured by how many columns it
 366    occupies on the screen.  If PRECISION > 0, return the width of
 367    longest substring that doesn't exceed PRECISION, and set number of
 368    characters and bytes of the substring in *NCHARS and *NBYTES
 369    respectively.  */
 370
 371 int
 372 c_string_width (str, len, precision, nchars, nbytes)
 373      const unsigned char *str;
 374      int precision, *nchars, *nbytes;
 375 {
 376   int i = 0, i_byte = 0;
 377   int width = 0;
 378   struct Lisp_Char_Table *dp = buffer_display_table ();
 379
 380   while (i_byte < len)
 381     {
 382       int bytes, thiswidth;
 383       Lisp_Object val;
 384       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 385
 386       if (dp)
 387         {
 388           val = DISP_CHAR_VECTOR (dp, c);
 389           if (VECTORP (val))
 390             thiswidth = XVECTOR (val)->size;
 391           else
 392             thiswidth = CHAR_WIDTH (c);
 393         }
 394       else
 395         {
 396           thiswidth = CHAR_WIDTH (c);
 397         }
 398
 399       if (precision > 0
 400           && (width + thiswidth > precision))
 401         {
 402           *nchars = i;
 403           *nbytes = i_byte;
 404           return width;
 405         }
 406       i++;
 407       i_byte += bytes;
 408       width += thiswidth;
 409   }
 410
 411   if (precision > 0)
 412     {
 413       *nchars = i;
 414       *nbytes = i_byte;
 415     }
 416
 417   return width;
 418 }
 419
 420 /* Return width of string STR of length LEN when displayed in the
 421    current buffer.  The width is measured by how many columns it
 422    occupies on the screen.  */
 423
 424 int
 425 strwidth (str, len)
 426      unsigned char *str;
 427      int len;
 428 {
 429   return c_string_width (str, len, -1, NULL, NULL);
 430 }
 431
 432 /* Return width of Lisp string STRING when displayed in the current
 433    buffer.  The width is measured by how many columns it occupies on
 434    the screen while paying attention to compositions.  If PRECISION >
 435    0, return the width of longest substring that doesn't exceed
 436    PRECISION, and set number of characters and bytes of the substring
 437    in *NCHARS and *NBYTES respectively.  */
 438
 439 int
 440 lisp_string_width (string, precision, nchars, nbytes)
 441      Lisp_Object string;
 442      int precision, *nchars, *nbytes;
 443 {
 444   int len = SCHARS (string);
 445   /* This set multibyte to 0 even if STRING is multibyte when it
 446      contains only ascii and eight-bit-graphic, but that's
 447      intentional.  */
 448   int multibyte = len < SBYTES (string);
 449   unsigned char *str = SDATA (string);
 450   int i = 0, i_byte = 0;
 451   int width = 0;
 452   struct Lisp_Char_Table *dp = buffer_display_table ();
 453
 454   while (i < len)
 455     {
 456       int chars, bytes, thiswidth;
 457       Lisp_Object val;
 458       int cmp_id;
 459       EMACS_INT ignore, end;
 460
 461       if (find_composition (i, -1, &ignore, &end, &val, string)
 462           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 463               >= 0))
 464         {
 465           thiswidth = composition_table[cmp_id]->width;
 466           chars = end - i;
 467           bytes = string_char_to_byte (string, end) - i_byte;
 468         }
 469       else
 470         {
 471           int c;
 472
 473           if (multibyte)
 474             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 475           else
 476             c = str[i_byte], bytes = 1;
 477           chars = 1;
 478           if (dp)
 479             {
 480               val = DISP_CHAR_VECTOR (dp, c);
 481               if (VECTORP (val))
 482                 thiswidth = XVECTOR (val)->size;
 483               else
 484                 thiswidth = CHAR_WIDTH (c);
 485             }
 486           else
 487             {
 488               thiswidth = CHAR_WIDTH (c);
 489             }
 490         }
 491
 492       if (precision > 0
 493           && (width + thiswidth > precision))
 494         {
 495           *nchars = i;
 496           *nbytes = i_byte;
 497           return width;
 498         }
 499       i += chars;
 500       i_byte += bytes;
 501       width += thiswidth;
 502   }
 503
 504   if (precision > 0)
 505     {
 506       *nchars = i;
 507       *nbytes = i_byte;
 508     }
 509
 510   return width;
 511 }
 512
 513 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 514        doc: /* Return width of STRING when displayed in the current buffer.
 515 Width is measured by how many columns it occupies on the screen.
 516 When calculating width of a multibyte character in STRING,
 517 only the base leading-code is considered; the validity of
 518 the following bytes is not checked.  Tabs in STRING are always
 519 taken to occupy `tab-width' columns.  */)
 520      (str)
 521      Lisp_Object str;
 522 {
 523   Lisp_Object val;
 524
 525   CHECK_STRING (str);
 526   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 527   return val;
 528 }
 529
 530 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 531        doc: /* Return the direction of CHAR.
 532 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
 533      (ch)
 534      Lisp_Object ch;
 535 {
 536   int c;
 537
 538   CHECK_CHARACTER (ch);
 539   c = XINT (ch);
 540   return CHAR_TABLE_REF (Vchar_direction_table, c);
 541 }
 542
 543 /* Return the number of characters in the NBYTES bytes at PTR.
 544    This works by looking at the contents and checking for multibyte
 545    sequences while assuming that there's no invalid sequence.
 546    However, if the current buffer has enable-multibyte-characters =
 547    nil, we treat each byte as a character.  */
 548
 549 int
 550 chars_in_text (ptr, nbytes)
 551      const unsigned char *ptr;
 552      int nbytes;
 553 {
 554   /* current_buffer is null at early stages of Emacs initialization.  */
 555   if (current_buffer == 0
 556       || NILP (current_buffer->enable_multibyte_characters))
 557     return nbytes;
 558
 559   return multibyte_chars_in_text (ptr, nbytes);
 560 }
 561
 562 /* Return the number of characters in the NBYTES bytes at PTR.
 563    This works by looking at the contents and checking for multibyte
 564    sequences while assuming that there's no invalid sequence.  It
 565    ignores enable-multibyte-characters.  */
 566
 567 int
 568 multibyte_chars_in_text (ptr, nbytes)
 569      const unsigned char *ptr;
 570      int nbytes;
 571 {
 572   const unsigned char *endp = ptr + nbytes;
 573   int chars = 0;
 574
 575   while (ptr < endp)
 576     {
 577       int len = MULTIBYTE_LENGTH (ptr, endp);
 578
 579       if (len == 0)
 580         abort ();
 581       ptr += len;
 582       chars++;
 583     }
 584
 585   return chars;
 586 }
 587
 588 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 589    characters and bytes in it, and store them in *NCHARS and *NBYTES
 590    respectively.  On counting bytes, pay attention to that 8-bit
 591    characters not constructing a valid multibyte sequence are
 592    represented by 2-byte in a multibyte text.  */
 593
 594 void
 595 parse_str_as_multibyte (str, len, nchars, nbytes)
 596      const unsigned char *str;
 597      int len, *nchars, *nbytes;
 598 {
 599   const unsigned char *endp = str + len;
 600   int n, chars = 0, bytes = 0;
 601
 602   if (len >= MAX_MULTIBYTE_LENGTH)
 603     {
 604       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 605       while (str < adjusted_endp)
 606         {
 607           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 608             str += n, bytes += n;
 609           else
 610             str++, bytes += 2;
 611           chars++;
 612         }
 613     }
 614   while (str < endp)
 615     {
 616       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 617         str += n, bytes += n;
 618       else
 619         str++, bytes += 2;
 620       chars++;
 621     }
 622
 623   *nchars = chars;
 624   *nbytes = bytes;
 625   return;
 626 }
 627
 628 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 629    It actually converts only such 8-bit characters that don't contruct
 630    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 631    NCHARS is nonzero, set *NCHARS to the number of characters in the
 632    text.  It is assured that we can use LEN bytes at STR as a work
 633    area and that is enough.  Return the number of bytes of the
 634    resulting text.  */
 635
 636 int
 637 str_as_multibyte (str, len, nbytes, nchars)
 638      unsigned char *str;
 639      int len, nbytes, *nchars;
 640 {
 641   unsigned char *p = str, *endp = str + nbytes;
 642   unsigned char *to;
 643   int chars = 0;
 644   int n;
 645
 646   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 647     {
 648       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 649       while (p < adjusted_endp
 650              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 651         p += n, chars++;
 652     }
 653   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 654     p += n, chars++;
 655   if (nchars)
 656     *nchars = chars;
 657   if (p == endp)
 658     return nbytes;
 659
 660   to = p;
 661   nbytes = endp - p;
 662   endp = str + len;
 663   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 664   p = endp - nbytes;
 665
 666   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 667     {
 668       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 669       while (p < adjusted_endp)
 670         {
 671           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 672             {
 673               while (n--)
 674                 *to++ = *p++;
 675             }
 676           else
 677             {
 678               int c = *p++;
 679               c = BYTE8_TO_CHAR (c);
 680               to += CHAR_STRING (c, to);
 681             }
 682         }
 683       chars++;
 684     }
 685   while (p < endp)
 686     {
 687       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 688         {
 689           while (n--)
 690             *to++ = *p++;
 691         }
 692       else
 693         {
 694           int c = *p++;
 695           c = BYTE8_TO_CHAR (c);
 696           to += CHAR_STRING (c, to);
 697         }
 698       chars++;
 699     }
 700   if (nchars)
 701     *nchars = chars;
 702   return (to - str);
 703 }
 704
 705 /* Parse unibyte string at STR of LEN bytes, and return the number of
 706    bytes it may ocupy when converted to multibyte string by
 707    `str_to_multibyte'.  */
 708
 709 int
 710 parse_str_to_multibyte (str, len)
 711      unsigned char *str;
 712      int len;
 713 {
 714   unsigned char *endp = str + len;
 715   int bytes;
 716
 717   for (bytes = 0; str < endp; str++)
 718     bytes += (*str < 0x80) ? 1 : 2;
 719   return bytes;
 720 }
 721
 722
 723 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 724    that contains the same single-byte characters.  It actually
 725    converts all 8-bit characters to multibyte forms.  It is assured
 726    that we can use LEN bytes at STR as a work area and that is
 727    enough.  */
 728
 729 int
 730 str_to_multibyte (str, len, bytes)
 731      unsigned char *str;
 732      int len, bytes;
 733 {
 734   unsigned char *p = str, *endp = str + bytes;
 735   unsigned char *to;
 736
 737   while (p < endp && *p < 0x80) p++;
 738   if (p == endp)
 739     return bytes;
 740   to = p;
 741   bytes = endp - p;
 742   endp = str + len;
 743   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 744   p = endp - bytes;
 745   while (p < endp)
 746     {
 747       int c = *p++;
 748
 749       if (c >= 0x80)
 750         c = BYTE8_TO_CHAR (c);
 751       to += CHAR_STRING (c, to);
 752     }
 753   return (to - str);
 754 }
 755
 756 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 757    actually converts characters in the range 0x80..0xFF to
 758    unibyte.  */
 759
 760 int
 761 str_as_unibyte (str, bytes)
 762      unsigned char *str;
 763      int bytes;
 764 {
 765   const unsigned char *p = str, *endp = str + bytes;
 766   unsigned char *to;
 767   int c, len;
 768
 769   while (p < endp)
 770     {
 771       c = *p;
 772       len = BYTES_BY_CHAR_HEAD (c);
 773       if (CHAR_BYTE8_HEAD_P (c))
 774         break;
 775       p += len;
 776     }
 777   to = str + (p - str);
 778   while (p < endp)
 779     {
 780       c = *p;
 781       len = BYTES_BY_CHAR_HEAD (c);
 782       if (CHAR_BYTE8_HEAD_P (c))
 783         {
 784           c = STRING_CHAR_ADVANCE (p);
 785           *to++ = CHAR_TO_BYTE8 (c);
 786         }
 787       else
 788         {
 789           while (len--) *to++ = *p++;
 790         }
 791     }
 792   return (to - str);
 793 }
 794
 795 int
 796 string_count_byte8 (string)
 797      Lisp_Object string;
 798 {
 799   int multibyte = STRING_MULTIBYTE (string);
 800   int nbytes = SBYTES (string);
 801   unsigned char *p = SDATA (string);
 802   unsigned char *pend = p + nbytes;
 803   int count = 0;
 804   int c, len;
 805
 806   if (multibyte)
 807     while (p < pend)
 808       {
 809         c = *p;
 810         len = BYTES_BY_CHAR_HEAD (c);
 811
 812         if (CHAR_BYTE8_HEAD_P (c))
 813           count++;
 814         p += len;
 815       }
 816   else
 817     while (p < pend)
 818       {
 819         if (*p++ >= 0x80)
 820           count++;
 821       }
 822   return count;
 823 }
 824
 825
 826 Lisp_Object
 827 string_escape_byte8 (string)
 828      Lisp_Object string;
 829 {
 830   int nchars = SCHARS (string);
 831   int nbytes = SBYTES (string);
 832   int multibyte = STRING_MULTIBYTE (string);
 833   int byte8_count;
 834   const unsigned char *src, *src_end;
 835   unsigned char *dst;
 836   Lisp_Object val;
 837   int c, len;
 838
 839   if (multibyte && nchars == nbytes)
 840     return string;
 841
 842   byte8_count = string_count_byte8 (string);
 843
 844   if (byte8_count == 0)
 845     return string;
 846
 847   if (multibyte)
 848     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 849     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 850                                         nbytes + byte8_count * 2);
 851   else
 852     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 853     val = make_uninit_string (nbytes + byte8_count * 3);
 854
 855   src = SDATA (string);
 856   src_end = src + nbytes;
 857   dst = SDATA (val);
 858   if (multibyte)
 859     while (src < src_end)
 860       {
 861         c = *src;
 862         len = BYTES_BY_CHAR_HEAD (c);
 863
 864         if (CHAR_BYTE8_HEAD_P (c))
 865           {
 866             c = STRING_CHAR_ADVANCE (src);
 867             c = CHAR_TO_BYTE8 (c);
 868             sprintf ((char *) dst, "\\%03o", c);
 869             dst += 4;
 870           }
 871         else
 872           while (len--) *dst++ = *src++;
 873       }
 874   else
 875     while (src < src_end)
 876       {
 877         c = *src++;
 878         if (c >= 0x80)
 879           {
 880             sprintf ((char *) dst, "\\%03o", c);
 881             dst += 4;
 882           }
 883         else
 884           *dst++ = c;
 885       }
 886   return val;
 887 }
 888
 889 \f
 890 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 891        doc: /*
 892 Concatenate all the argument characters and make the result a string.
 893 usage: (string &rest CHARACTERS)  */)
 894      (n, args)
 895      int n;
 896      Lisp_Object *args;
 897 {
 898   int i;
 899   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 900   unsigned char *p = buf;
 901   int c;
 902
 903   for (i = 0; i < n; i++)
 904     {
 905       CHECK_CHARACTER (args[i]);
 906       c = XINT (args[i]);
 907       p += CHAR_STRING (c, p);
 908     }
 909
 910   return make_string_from_bytes ((char *) buf, n, p - buf);
 911 }
 912
 913 void
 914 init_character_once ()
 915 {
 916 }
 917
 918 #ifdef emacs
 919
 920 void
 921 syms_of_character ()
 922 {
 923   DEFSYM (Qcharacterp, "characterp");
 924   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 925
 926   staticpro (&Vchar_unify_table);
 927   Vchar_unify_table = Qnil;
 928
 929   defsubr (&Smax_char);
 930   defsubr (&Scharacterp);
 931   defsubr (&Sunibyte_char_to_multibyte);
 932   defsubr (&Smultibyte_char_to_unibyte);
 933   defsubr (&Schar_bytes);
 934   defsubr (&Schar_width);
 935   defsubr (&Sstring_width);
 936   defsubr (&Schar_direction);
 937   defsubr (&Sstring);
 938
 939   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 940                doc: /*
 941 Vector recording all translation tables ever defined.
 942 Each element is a pair (SYMBOL . TABLE) relating the table to the
 943 symbol naming it.  The ID of a translation table is an index into this vector.  */);
 944   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
 945
 946   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
 947                doc: /*
 948 A char-table for characters which invoke auto-filling.
 949 Such characters have value t in this table.  */);
 950   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
 951   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
 952   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
 953
 954   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
 955                doc: /*
 956 A char-table for width (columns) of each character.  */);
 957   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
 958   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
 959   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
 960                         make_number (4));
 961
 962   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
 963                doc: /* A char-table for direction of each character.  */);
 964   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
 965
 966   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
 967                doc: /* A char-table for each printable character.  */);
 968   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
 969   Fset_char_table_range (Vprintable_chars,
 970                          Fcons (make_number (32), make_number (126)), Qt);
 971   Fset_char_table_range (Vprintable_chars,
 972                          Fcons (make_number (160),
 973                                 make_number (MAX_5_BYTE_CHAR)), Qt);
 974
 975   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
 976                doc: /* Char table of script symbols.
 977 It has one extra slot whose value is a list of script symbols.  */);
 978
 979   /* Intern this now in case it isn't already done.
 980      Setting this variable twice is harmless.
 981      But don't staticpro it here--that is done in alloc.c.  */
 982   Qchar_table_extra_slots = intern ("char-table-extra-slots");
 983   DEFSYM (Qchar_script_table, "char-script-table");
 984   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
 985   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
 986 }
 987
 988 #endif /* emacs */
 989
 990 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
 991    (do not change this comment) */