code.delx.au - gnu-emacs/blob - src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2005 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /* First, read the documentation in `character.h' to understand the
  27    code in this file.  */
  28
  29 #ifdef emacs
  30 #include <config.h>
  31 #endif
  32
  33 #include <stdio.h>
  34
  35 #ifdef emacs
  36
  37 #include <sys/types.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
     /* Symbol `characterp'; set up with DEFSYM in syms_of_character.  */
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of all translation tables ever defined.
  54    The ID of a translation table is an index into this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
     /* Symbol `auto-fill-chars'; set up with DEFSYM in syms_of_character
        and passed to Fmake_char_table when Vauto_fill_chars is created.  */
  60 Lisp_Object Qauto_fill_chars;
  61
  62 /* Char-table of information about which character to unify to which
  63    Unicode character.  Set to nil in syms_of_character.  */
  64 Lisp_Object Vchar_unify_table;
  65
  66 /* A char-table.  An element is non-nil iff the corresponding
  67    character has a printable glyph.  */
  68 Lisp_Object Vprintable_chars;
  69
  70 /* A char-table.  An element is the column width of the corresponding
  71    character.  */
  72 Lisp_Object Vchar_width_table;
  73
  74 /* A char-table.  An element is a symbol indicating the direction
  75    property of the corresponding character.
         NOTE(review): syms_of_character creates this table with the
         integer 1 as its default value, and the `char-direction'
         docstring speaks of 0 and 1, not symbols -- confirm which
         representation is intended.  */
  76 Lisp_Object Vchar_direction_table;
  77
  78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  79 unsigned char *_fetch_multibyte_char_p;
  80
  81 /* Char table of scripts.  */
  82 Lisp_Object Vchar_script_table;
  83
     /* Symbol `char-script-table'; set up with DEFSYM in
        syms_of_character and used there to create Vchar_script_table.  */
  84 static Lisp_Object Qchar_script_table;
  85
  86 /* Mapping table from unibyte chars to multibyte chars.  */
  87 int unibyte_to_multibyte_table[256];
  88
  89 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
  90    char.  */
  91 char unibyte_has_multibyte_table[256];
  92
  93 \f
  94
  95 /* Store multibyte form of character C at P.  If C has modifier bits,
  96    handle them appropriately.  */
  97
  98 int
  99 char_string (c, p)
 100      int c;
 101      unsigned char *p;
 102 {
 103   int bytes;
 104
 105   if (c & CHAR_MODIFIER_MASK)
 106     {
 107       /* As an non-ASCII character can't have modifier bits, we just
 108          ignore the bits.  */
 109       if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 110         {
 111           /* For Meta, Shift, and Control modifiers, we need special care.  */
 112           if (c & CHAR_META)
 113             {
 114               /* Move the meta bit to the right place for a string.  */
 115               c = (c & ~CHAR_META) | 0x80;
 116             }
 117           if (c & CHAR_SHIFT)
 118             {
 119               /* Shift modifier is valid only with [A-Za-z].  */
 120               if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 121                 c &= ~CHAR_SHIFT;
 122               else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 123                 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 124             }
 125           if (c & CHAR_CTL)
 126             {
 127               /* Simulate the code in lread.c.  */
 128               /* Allow `\C- ' and `\C-?'.  */
 129               if (c == (CHAR_CTL | ' '))
 130                 c = 0;
 131               else if (c == (CHAR_CTL | '?'))
 132                 c = 127;
 133               /* ASCII control chars are made from letters (both cases),
 134                  as well as the non-letters within 0100...0137.  */
 135               else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 136                 c &= (037 | (~0177 & ~CHAR_CTL));
 137               else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 138                 c &= (037 | (~0177 & ~CHAR_CTL));
 139             }
 140         }
 141
 142       /* If C still has any modifier bits, just ignore it.  */
 143       c &= ~CHAR_MODIFIER_MASK;
 144     }
 145
 146   MAYBE_UNIFY_CHAR (c);
 147
 148   if (c <= MAX_3_BYTE_CHAR)
 149     {
 150       bytes = CHAR_STRING (c, p);
 151     }
 152   else if (c <= MAX_4_BYTE_CHAR)
 153     {
 154       p[0] = (0xF0 | (c >> 18));
 155       p[1] = (0x80 | ((c >> 12) & 0x3F));
 156       p[2] = (0x80 | ((c >> 6) & 0x3F));
 157       p[3] = (0x80 | (c & 0x3F));
 158       bytes = 4;
 159     }
 160   else if (c <= MAX_5_BYTE_CHAR)
 161     {
 162       p[0] = 0xF8;
 163       p[1] = (0x80 | ((c >> 18) & 0x0F));
 164       p[2] = (0x80 | ((c >> 12) & 0x3F));
 165       p[3] = (0x80 | ((c >> 6) & 0x3F));
 166       p[4] = (0x80 | (c & 0x3F));
 167       bytes = 5;
 168     }
 169   else
 170     {
 171       c = CHAR_TO_BYTE8 (c);
 172       bytes = BYTE8_STRING (c, p);
 173     }
 174
 175   return bytes;
 176 }
 177
 178
 179 /* Return a character whose multibyte form is at P.  Set LEN is not
 180    NULL, it must be a pointer to integer.  In that case, set *LEN to
 181    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 182    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 183    the ending address (i.e. the starting address of the next
 184    character) of the multibyte form.  */
 185
 186 int
 187 string_char (p, advanced, len)
 188      const unsigned char *p;
 189      const unsigned char **advanced;
 190      int *len;
 191 {
 192   int c;
 193   const unsigned char *saved_p = p;
 194
 195   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 196     {
 197       c = STRING_CHAR_ADVANCE (p);
 198     }
 199   else if (! (*p & 0x08))
 200     {
 201       c = ((((p)[0] & 0xF) << 18)
 202            | (((p)[1] & 0x3F) << 12)
 203            | (((p)[2] & 0x3F) << 6)
 204            | ((p)[3] & 0x3F));
 205       p += 4;
 206     }
 207   else
 208     {
 209       c = ((((p)[1] & 0x3F) << 18)
 210            | (((p)[2] & 0x3F) << 12)
 211            | (((p)[3] & 0x3F) << 6)
 212            | ((p)[4] & 0x3F));
 213       p += 5;
 214     }
 215
 216   MAYBE_UNIFY_CHAR (c);
 217
 218   if (len)
 219     *len = p - saved_p;
 220   if (advanced)
 221     *advanced = p;
 222   return c;
 223 }
 224
 225
 226 /* Translate character C by translation table TABLE.  If C is
 227    negative, translate a character specified by CHARSET and CODE.  If
 228    no translation is found in TABLE, return the untranslated
 229    character.  If TABLE is a list, elements are char tables.  In this
 230    case, translace C by all tables.  */
 231
 232 int
 233 translate_char (table, c)
 234      Lisp_Object table;
 235      int c;
 236 {
 237   if (CHAR_TABLE_P (table))
 238     {
 239       Lisp_Object ch;
 240
 241       ch = CHAR_TABLE_REF (table, c);
 242       if (CHARACTERP (ch))
 243         c = XINT (ch);
 244     }
 245   else
 246     {
 247       for (; CONSP (table); table = XCDR (table))
 248         c = translate_char (XCAR (table), c);
 249     }
 250   return c;
 251 }
 252
 253 /* Convert the multibyte character C to unibyte 8-bit character based
 254    on the current value of charset_unibyte.  If dimension of
 255    charset_unibyte is more than one, return (C & 0xFF).
 256
 257    The argument REV_TBL is now ignored.  It will be removed in the
 258    future.  */
 259
 260 int
 261 multibyte_char_to_unibyte (c, rev_tbl)
 262      int c;
 263      Lisp_Object rev_tbl;
 264 {
 265   struct charset *charset;
 266   unsigned c1;
 267
 268   if (CHAR_BYTE8_P (c))
 269     return CHAR_TO_BYTE8 (c);
 270   charset = CHARSET_FROM_ID (charset_unibyte);
 271   c1 = ENCODE_CHAR (charset, c);
 272   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 273 }
 274
 275
 276 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 277        doc: /* Return non-nil if OBJECT is a character.  */)
 278      (object, ignore)
 279      Lisp_Object object, ignore;
 280 {
 281   return (CHARACTERP (object) ? Qt : Qnil);
 282 }
 283
 284 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 285        doc: /* Return the character of the maximum code.  */)
 286      ()
 287 {
 288   return make_number (MAX_CHAR);
 289 }
 290
 291 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 292        Sunibyte_char_to_multibyte, 1, 1, 0,
 293        doc: /* Convert the unibyte character CH to multibyte character.
 294 The multibyte character is a result of decoding CH by
 295 the current unibyte charset (see `unibyte-charset').  */)
 296      (ch)
 297      Lisp_Object ch;
 298 {
 299   int c;
 300   struct charset *charset;
 301
 302   CHECK_CHARACTER (ch);
 303   c = XFASTINT (ch);
 304   if (c >= 0400)
 305     error ("Invalid unibyte character: %d", c);
 306   charset = CHARSET_FROM_ID (charset_unibyte);
 307   c = DECODE_CHAR (charset, c);
 308   if (c < 0)
 309     c = BYTE8_TO_CHAR (XFASTINT (ch));
 310   return make_number (c);
 311 }
 312
 313 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 314        Smultibyte_char_to_unibyte, 1, 1, 0,
 315        doc: /* Convert the multibyte character CH to unibyte character.\n\
 316 The unibyte character is a result of encoding CH by
 317 the current primary charset (value of `charset-primary').  */)
 318      (ch)
 319      Lisp_Object ch;
 320 {
 321   int c;
 322
 323   CHECK_CHARACTER (ch);
 324   c = XFASTINT (ch);
 325   c = CHAR_TO_BYTE8 (c);
 326   return make_number (c);
 327 }
 328
 329 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 330        doc: /* Return 1 regardless of the argument CHAR.
 331 This is now an obsolete function.  We keep it just for backward compatibility.   */)
 332      (ch)
 333      Lisp_Object ch;
 334 {
 335   CHECK_CHARACTER (ch);
 336   return make_number (1);
 337 }
 338
 339 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 340        doc: /* Return width of CHAR when displayed in the current buffer.
 341 The width is measured by how many columns it occupies on the screen.
 342 Tab is taken to occupy `tab-width' columns.  */)
 343      (ch)
 344        Lisp_Object ch;
 345 {
 346   Lisp_Object disp;
 347   int c, width;
 348   struct Lisp_Char_Table *dp = buffer_display_table ();
 349
 350   CHECK_CHARACTER (ch);
 351   c = XINT (ch);
 352
 353   /* Get the way the display table would display it.  */
 354   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 355
 356   if (VECTORP (disp))
 357     width = ASIZE (disp);
 358   else
 359     width = CHAR_WIDTH (c);
 360
 361   return make_number (width);
 362 }
 363
 364 /* Return width of string STR of length LEN when displayed in the
 365    current buffer.  The width is measured by how many columns it
 366    occupies on the screen.  If PRECISION > 0, return the width of
 367    longest substring that doesn't exceed PRECISION, and set number of
 368    characters and bytes of the substring in *NCHARS and *NBYTES
 369    respectively.  */
 370
 371 int
 372 c_string_width (str, len, precision, nchars, nbytes)
 373      const unsigned char *str;
 374      int precision, *nchars, *nbytes;
 375 {
 376   int i = 0, i_byte = 0;
 377   int width = 0;
 378   struct Lisp_Char_Table *dp = buffer_display_table ();
 379
 380   while (i_byte < len)
 381     {
 382       int bytes, thiswidth;
 383       Lisp_Object val;
 384       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 385
 386       if (dp)
 387         {
 388           val = DISP_CHAR_VECTOR (dp, c);
 389           if (VECTORP (val))
 390             thiswidth = XVECTOR (val)->size;
 391           else
 392             thiswidth = CHAR_WIDTH (c);
 393         }
 394       else
 395         {
 396           thiswidth = CHAR_WIDTH (c);
 397         }
 398
 399       if (precision > 0
 400           && (width + thiswidth > precision))
 401         {
 402           *nchars = i;
 403           *nbytes = i_byte;
 404           return width;
 405         }
 406       i++;
 407       i_byte += bytes;
 408       width += thiswidth;
 409   }
 410
 411   if (precision > 0)
 412     {
 413       *nchars = i;
 414       *nbytes = i_byte;
 415     }
 416
 417   return width;
 418 }
 419
 420 /* Return width of string STR of length LEN when displayed in the
 421    current buffer.  The width is measured by how many columns it
 422    occupies on the screen.  */
 423
 424 int
 425 strwidth (str, len)
 426      unsigned char *str;
 427      int len;
 428 {
 429   return c_string_width (str, len, -1, NULL, NULL);
 430 }
 431
 432 /* Return width of Lisp string STRING when displayed in the current
 433    buffer.  The width is measured by how many columns it occupies on
 434    the screen while paying attention to compositions.  If PRECISION >
 435    0, return the width of longest substring that doesn't exceed
 436    PRECISION, and set number of characters and bytes of the substring
 437    in *NCHARS and *NBYTES respectively.  */
 438
 439 int
 440 lisp_string_width (string, precision, nchars, nbytes)
 441      Lisp_Object string;
 442      int precision, *nchars, *nbytes;
 443 {
 444   int len = SCHARS (string);
 445   unsigned char *str = SDATA (string);
 446   int i = 0, i_byte = 0;
 447   int width = 0;
 448   struct Lisp_Char_Table *dp = buffer_display_table ();
 449
 450   while (i < len)
 451     {
 452       int chars, bytes, thiswidth;
 453       Lisp_Object val;
 454       int cmp_id;
 455       EMACS_INT ignore, end;
 456
 457       if (find_composition (i, -1, &ignore, &end, &val, string)
 458           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 459               >= 0))
 460         {
 461           thiswidth = composition_table[cmp_id]->width;
 462           chars = end - i;
 463           bytes = string_char_to_byte (string, end) - i_byte;
 464         }
 465       else if (dp)
 466         {
 467           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 468
 469           chars = 1;
 470           val = DISP_CHAR_VECTOR (dp, c);
 471           if (VECTORP (val))
 472             thiswidth = XVECTOR (val)->size;
 473           else
 474             thiswidth = CHAR_WIDTH (c);
 475         }
 476       else
 477         {
 478           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 479
 480           chars = 1;
 481           thiswidth = CHAR_WIDTH (c);
 482         }
 483
 484       if (precision > 0
 485           && (width + thiswidth > precision))
 486         {
 487           *nchars = i;
 488           *nbytes = i_byte;
 489           return width;
 490         }
 491       i += chars;
 492       i_byte += bytes;
 493       width += thiswidth;
 494   }
 495
 496   if (precision > 0)
 497     {
 498       *nchars = i;
 499       *nbytes = i_byte;
 500     }
 501
 502   return width;
 503 }
 504
 505 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 506        doc: /* Return width of STRING when displayed in the current buffer.
 507 Width is measured by how many columns it occupies on the screen.
 508 When calculating width of a multibyte character in STRING,
 509 only the base leading-code is considered; the validity of
 510 the following bytes is not checked.  Tabs in STRING are always
 511 taken to occupy `tab-width' columns.  */)
 512      (str)
 513      Lisp_Object str;
 514 {
 515   Lisp_Object val;
 516
 517   CHECK_STRING (str);
 518   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 519   return val;
 520 }
 521
 522 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 523        doc: /* Return the direction of CHAR.
 524 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
 525      (ch)
 526      Lisp_Object ch;
 527 {
 528   int c;
 529
 530   CHECK_CHARACTER (ch);
 531   c = XINT (ch);
 532   return CHAR_TABLE_REF (Vchar_direction_table, c);
 533 }
 534
 535 /* Return the number of characters in the NBYTES bytes at PTR.
 536    This works by looking at the contents and checking for multibyte
 537    sequences while assuming that there's no invalid sequence.
 538    However, if the current buffer has enable-multibyte-characters =
 539    nil, we treat each byte as a character.  */
 540
 541 int
 542 chars_in_text (ptr, nbytes)
 543      const unsigned char *ptr;
 544      int nbytes;
 545 {
 546   /* current_buffer is null at early stages of Emacs initialization.  */
 547   if (current_buffer == 0
 548       || NILP (current_buffer->enable_multibyte_characters))
 549     return nbytes;
 550
 551   return multibyte_chars_in_text (ptr, nbytes);
 552 }
 553
 554 /* Return the number of characters in the NBYTES bytes at PTR.
 555    This works by looking at the contents and checking for multibyte
 556    sequences while assuming that there's no invalid sequence.  It
 557    ignores enable-multibyte-characters.  */
 558
 559 int
 560 multibyte_chars_in_text (ptr, nbytes)
 561      const unsigned char *ptr;
 562      int nbytes;
 563 {
 564   const unsigned char *endp = ptr + nbytes;
 565   int chars = 0;
 566
 567   while (ptr < endp)
 568     {
 569       int len = MULTIBYTE_LENGTH (ptr, endp);
 570
 571       if (len == 0)
 572         abort ();
 573       ptr += len;
 574       chars++;
 575     }
 576
 577   return chars;
 578 }
 579
 580 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 581    characters and bytes in it, and store them in *NCHARS and *NBYTES
 582    respectively.  On counting bytes, pay attention to that 8-bit
 583    characters not constructing a valid multibyte sequence are
 584    represented by 2-byte in a multibyte text.  */
 585
 586 void
 587 parse_str_as_multibyte (str, len, nchars, nbytes)
 588      const unsigned char *str;
 589      int len, *nchars, *nbytes;
 590 {
 591   const unsigned char *endp = str + len;
 592   int n, chars = 0, bytes = 0;
 593
 594   if (len >= MAX_MULTIBYTE_LENGTH)
 595     {
 596       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 597       while (str < adjusted_endp)
 598         {
 599           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 600             str += n, bytes += n;
 601           else
 602             str++, bytes += 2;
 603           chars++;
 604         }
 605     }
 606   while (str < endp)
 607     {
 608       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 609         str += n, bytes += n;
 610       else
 611         str++, bytes += 2;
 612       chars++;
 613     }
 614
 615   *nchars = chars;
 616   *nbytes = bytes;
 617   return;
 618 }
 619
 620 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 621    It actually converts only such 8-bit characters that don't contruct
 622    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 623    NCHARS is nonzero, set *NCHARS to the number of characters in the
 624    text.  It is assured that we can use LEN bytes at STR as a work
 625    area and that is enough.  Return the number of bytes of the
 626    resulting text.  */
 627
 628 int
 629 str_as_multibyte (str, len, nbytes, nchars)
 630      unsigned char *str;
 631      int len, nbytes, *nchars;
 632 {
 633   unsigned char *p = str, *endp = str + nbytes;
 634   unsigned char *to;
 635   int chars = 0;
 636   int n;
 637
 638   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 639     {
 640       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 641       while (p < adjusted_endp
 642              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 643         p += n, chars++;
 644     }
 645   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 646     p += n, chars++;
 647   if (nchars)
 648     *nchars = chars;
 649   if (p == endp)
 650     return nbytes;
 651
 652   to = p;
 653   nbytes = endp - p;
 654   endp = str + len;
 655   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 656   p = endp - nbytes;
 657
 658   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 659     {
 660       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 661       while (p < adjusted_endp)
 662         {
 663           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 664             {
 665               while (n--)
 666                 *to++ = *p++;
 667             }
 668           else
 669             {
 670               int c = *p++;
 671               c = BYTE8_TO_CHAR (c);
 672               to += CHAR_STRING (c, to);
 673             }
 674         }
 675       chars++;
 676     }
 677   while (p < endp)
 678     {
 679       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 680         {
 681           while (n--)
 682             *to++ = *p++;
 683         }
 684       else
 685         {
 686           int c = *p++;
 687           c = BYTE8_TO_CHAR (c);
 688           to += CHAR_STRING (c, to);
 689         }
 690       chars++;
 691     }
 692   if (nchars)
 693     *nchars = chars;
 694   return (to - str);
 695 }
 696
 697 /* Parse unibyte string at STR of LEN bytes, and return the number of
 698    bytes it may ocupy when converted to multibyte string by
 699    `str_to_multibyte'.  */
 700
 701 int
 702 parse_str_to_multibyte (str, len)
 703      unsigned char *str;
 704      int len;
 705 {
 706   unsigned char *endp = str + len;
 707   int bytes;
 708
 709   for (bytes = 0; str < endp; str++)
 710     bytes += (*str < 0x80) ? 1 : 2;
 711   return bytes;
 712 }
 713
 714
 715 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 716    that contains the same single-byte characters.  It actually
 717    converts all 8-bit characters to multibyte forms.  It is assured
 718    that we can use LEN bytes at STR as a work area and that is
 719    enough.  */
 720
 721 int
 722 str_to_multibyte (str, len, bytes)
 723      unsigned char *str;
 724      int len, bytes;
 725 {
 726   unsigned char *p = str, *endp = str + bytes;
 727   unsigned char *to;
 728
 729   while (p < endp && *p < 0x80) p++;
 730   if (p == endp)
 731     return bytes;
 732   to = p;
 733   bytes = endp - p;
 734   endp = str + len;
 735   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 736   p = endp - bytes;
 737   while (p < endp)
 738     {
 739       int c = *p++;
 740
 741       if (c >= 0x80)
 742         c = BYTE8_TO_CHAR (c);
 743       to += CHAR_STRING (c, to);
 744     }
 745   return (to - str);
 746 }
 747
 748 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 749    actually converts characters in the range 0x80..0xFF to
 750    unibyte.  */
 751
 752 int
 753 str_as_unibyte (str, bytes)
 754      unsigned char *str;
 755      int bytes;
 756 {
 757   const unsigned char *p = str, *endp = str + bytes;
 758   unsigned char *to;
 759   int c, len;
 760
 761   while (p < endp)
 762     {
 763       c = *p;
 764       len = BYTES_BY_CHAR_HEAD (c);
 765       if (CHAR_BYTE8_HEAD_P (c))
 766         break;
 767       p += len;
 768     }
 769   to = str + (p - str);
 770   while (p < endp)
 771     {
 772       c = *p;
 773       len = BYTES_BY_CHAR_HEAD (c);
 774       if (CHAR_BYTE8_HEAD_P (c))
 775         {
 776           c = STRING_CHAR_ADVANCE (p);
 777           *to++ = CHAR_TO_BYTE8 (c);
 778         }
 779       else
 780         {
 781           while (len--) *to++ = *p++;
 782         }
 783     }
 784   return (to - str);
 785 }
 786
 787 int
 788 string_count_byte8 (string)
 789      Lisp_Object string;
 790 {
 791   int multibyte = STRING_MULTIBYTE (string);
 792   int nbytes = SBYTES (string);
 793   unsigned char *p = SDATA (string);
 794   unsigned char *pend = p + nbytes;
 795   int count = 0;
 796   int c, len;
 797
 798   if (multibyte)
 799     while (p < pend)
 800       {
 801         c = *p;
 802         len = BYTES_BY_CHAR_HEAD (c);
 803
 804         if (CHAR_BYTE8_HEAD_P (c))
 805           count++;
 806         p += len;
 807       }
 808   else
 809     while (p < pend)
 810       {
 811         if (*p++ >= 0x80)
 812           count++;
 813       }
 814   return count;
 815 }
 816
 817
 818 Lisp_Object
 819 string_escape_byte8 (string)
 820      Lisp_Object string;
 821 {
 822   int nchars = SCHARS (string);
 823   int nbytes = SBYTES (string);
 824   int multibyte = STRING_MULTIBYTE (string);
 825   int byte8_count;
 826   const unsigned char *src, *src_end;
 827   unsigned char *dst;
 828   Lisp_Object val;
 829   int c, len;
 830
 831   if (multibyte && nchars == nbytes)
 832     return string;
 833
 834   byte8_count = string_count_byte8 (string);
 835
 836   if (byte8_count == 0)
 837     return string;
 838
 839   if (multibyte)
 840     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 841     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 842                                         nbytes + byte8_count * 2);
 843   else
 844     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 845     val = make_uninit_string (nbytes + byte8_count * 3);
 846
 847   src = SDATA (string);
 848   src_end = src + nbytes;
 849   dst = SDATA (val);
 850   if (multibyte)
 851     while (src < src_end)
 852       {
 853         c = *src;
 854         len = BYTES_BY_CHAR_HEAD (c);
 855
 856         if (CHAR_BYTE8_HEAD_P (c))
 857           {
 858             c = STRING_CHAR_ADVANCE (src);
 859             c = CHAR_TO_BYTE8 (c);
 860             sprintf ((char *) dst, "\\%03o", c);
 861             dst += 4;
 862           }
 863         else
 864           while (len--) *dst++ = *src++;
 865       }
 866   else
 867     while (src < src_end)
 868       {
 869         c = *src++;
 870         if (c >= 0x80)
 871           {
 872             sprintf ((char *) dst, "\\%03o", c);
 873             dst += 4;
 874           }
 875         else
 876           *dst++ = c;
 877       }
 878   return val;
 879 }
 880
 881 \f
 882 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 883        doc: /*
 884 Concatenate all the argument characters and make the result a string.
 885 usage: (string &rest CHARACTERS)  */)
 886      (n, args)
 887      int n;
 888      Lisp_Object *args;
 889 {
 890   int i;
 891   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 892   unsigned char *p = buf;
 893   int c;
 894
 895   for (i = 0; i < n; i++)
 896     {
 897       CHECK_CHARACTER (args[i]);
 898       c = XINT (args[i]);
 899       p += CHAR_STRING (c, p);
 900     }
 901
 902   return make_string_from_bytes ((char *) buf, n, p - buf);
 903 }
 904
 905 void
 906 init_character_once ()
 907 {
       /* Currently a no-op: this body performs no initialization.  */
 908 }
 909
 910 #ifdef emacs
 911
 912 void
 913 syms_of_character ()
 914 {
       /* Set up the symbol variables used by this file.  */
 915   DEFSYM (Qcharacterp, "characterp");
 916   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 917
       /* Vchar_unify_table is not declared with DEFVAR_LISP below, so
          it is registered with staticpro explicitly; it starts as nil.  */
 918   staticpro (&Vchar_unify_table);
 919   Vchar_unify_table = Qnil;
 920
       /* Register the primitives defined with DEFUN in this file.  */
 921   defsubr (&Smax_char);
 922   defsubr (&Scharacterp);
 923   defsubr (&Sunibyte_char_to_multibyte);
 924   defsubr (&Smultibyte_char_to_unibyte);
 925   defsubr (&Schar_bytes);
 926   defsubr (&Schar_width);
 927   defsubr (&Sstring_width);
 928   defsubr (&Schar_direction);
 929   defsubr (&Sstring);
 930
 931   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 932                doc: /*
 933 Vector recording all translation tables ever defined.
 934 Each element is a pair (SYMBOL . TABLE) relating the table to the
 935 symbol naming it.  The ID of a translation table is an index into this vector.  */);
       /* Start with 16 slots, all nil.  */
 936   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
 937
 938   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
 939                doc: /*
 940 A char-table for characters which invoke auto-filling.
 941 Such characters have value t in this table.  */);
       /* Initially only space and newline are marked with t.  */
 942   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
 943   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
 944   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
 945
 946   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
 947                doc: /*
 948 A char-table for width (columns) of each character.  */);
       /* Default width is 1; the ranges 0x80..0x9F and
          MAX_5_BYTE_CHAR + 1..MAX_CHAR are given width 4.  */
 949   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
 950   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
 951   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
 952                         make_number (4));
 953
 954   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
 955                doc: /* A char-table for direction of each character.  */);
       /* Every character starts with the value 1.
          NOTE(review): the `char-direction' docstring describes 1 as
          right-to-left -- confirm that this default is intended.  */
 956   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
 957
 958   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
 959                doc: /* A char-table for each printable character.  */);
       /* Default is nil; the ranges 32..126 and 160..MAX_5_BYTE_CHAR
          are set to t.  */
 960   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
 961   Fset_char_table_range (Vprintable_chars,
 962                          Fcons (make_number (32), make_number (126)), Qt);
 963   Fset_char_table_range (Vprintable_chars,
 964                          Fcons (make_number (160),
 965                                 make_number (MAX_5_BYTE_CHAR)), Qt);
 966
 967   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
 968                doc: /* Char table of script symbols.
 969 It has one extra slot whose value is a list of script symbols.  */);
 970
 971   /* Intern this now in case it isn't already done.
 972      Setting this variable twice is harmless.
 973      But don't staticpro it here--that is done in alloc.c.  */
 974   Qchar_table_extra_slots = intern ("char-table-extra-slots");
 975   DEFSYM (Qchar_script_table, "char-script-table");
       /* Give `char-script-table' the property value 1 under
          `char-table-extra-slots' before the table itself is made, in
          line with the "one extra slot" stated in the docstring.  */
 976   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
 977   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
 978 }
 979
 980 #endif /* emacs */
 981
 982 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
 983    (do not change this comment) */