code.delx.au - gnu-emacs/blob - src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /* At first, see the document in `character.h' to understand the code
  27    in this file.  */
  28
  29 #ifdef emacs
  30 #include <config.h>
  31 #endif
  32
  33 #include <stdio.h>
  34
  35 #ifdef emacs
  36
  37 #include <sys/types.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of all translation tables ever defined.
  54    The ID of a translation table is used to index this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
  60 Lisp_Object Qauto_fill_chars;
  61
  62 /* Char-table of information about which character to unify to which
  63    Unicode character.  */
  64 Lisp_Object Vchar_unify_table;
  65
  66 /* A char-table.  An element is non-nil iff the corresponding
  67    character has a printable glyph.  */
  68 Lisp_Object Vprintable_chars;
  69
  70 /* A char-table.  An element is the column width of the corresponding
  71    character.  */
  72 Lisp_Object Vchar_width_table;
  73
  74 /* A char-table.  An element is a symbol indicating the direction
  75    property of corresponding character.  */
  76 Lisp_Object Vchar_direction_table;
  77
  78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  79 unsigned char *_fetch_multibyte_char_p;
  80
  81 /* Char table of scripts.  */
  82 Lisp_Object Vchar_script_table;
  83
  84 static Lisp_Object Qchar_script_table;
  85
  86 /* Mapping table from unibyte chars to multibyte chars.  */
  87 int unibyte_to_multibyte_table[256];
  88
  89 \f
  90
  91 /* Store multibyte form of character C at P.  If C has modifier bits,
  92    handle them appropriately.  */
  93
  94 int
  95 char_string (c, p)
  96      int c;
  97      unsigned char *p;
  98 {
  99   int bytes;
 100
 101   if (c & CHAR_MODIFIER_MASK)
 102     {
 103       /* As an non-ASCII character can't have modifier bits, we just
 104          ignore the bits.  */
 105       if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 106         {
 107           /* For Meta, Shift, and Control modifiers, we need special care.  */
 108           if (c & CHAR_META)
 109             {
 110               /* Move the meta bit to the right place for a string.  */
 111               c = (c & ~CHAR_META) | 0x80;
 112             }
 113           if (c & CHAR_SHIFT)
 114             {
 115               /* Shift modifier is valid only with [A-Za-z].  */
 116               if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 117                 c &= ~CHAR_SHIFT;
 118               else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 119                 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 120             }
 121           if (c & CHAR_CTL)
 122             {
 123               /* Simulate the code in lread.c.  */
 124               /* Allow `\C- ' and `\C-?'.  */
 125               if (c == (CHAR_CTL | ' '))
 126                 c = 0;
 127               else if (c == (CHAR_CTL | '?'))
 128                 c = 127;
 129               /* ASCII control chars are made from letters (both cases),
 130                  as well as the non-letters within 0100...0137.  */
 131               else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 132                 c &= (037 | (~0177 & ~CHAR_CTL));
 133               else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 134                 c &= (037 | (~0177 & ~CHAR_CTL));
 135             }
 136         }
 137
 138       /* If C still has any modifier bits, just ignore it.  */
 139       c &= ~CHAR_MODIFIER_MASK;
 140     }
 141
 142   MAYBE_UNIFY_CHAR (c);
 143
 144   if (c <= MAX_3_BYTE_CHAR)
 145     {
 146       bytes = CHAR_STRING (c, p);
 147     }
 148   else if (c <= MAX_4_BYTE_CHAR)
 149     {
 150       p[0] = (0xF0 | (c >> 18));
 151       p[1] = (0x80 | ((c >> 12) & 0x3F));
 152       p[2] = (0x80 | ((c >> 6) & 0x3F));
 153       p[3] = (0x80 | (c & 0x3F));
 154       bytes = 4;
 155     }
 156   else if (c <= MAX_5_BYTE_CHAR)
 157     {
 158       p[0] = 0xF8;
 159       p[1] = (0x80 | ((c >> 18) & 0x0F));
 160       p[2] = (0x80 | ((c >> 12) & 0x3F));
 161       p[3] = (0x80 | ((c >> 6) & 0x3F));
 162       p[4] = (0x80 | (c & 0x3F));
 163       bytes = 5;
 164     }
 165   else
 166     {
 167       c = CHAR_TO_BYTE8 (c);
 168       bytes = BYTE8_STRING (c, p);
 169     }
 170
 171   return bytes;
 172 }
 173
 174
 175 /* Return a character whose multibyte form is at P.  Set LEN is not
 176    NULL, it must be a pointer to integer.  In that case, set *LEN to
 177    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 178    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 179    the ending address (i.e. the starting address of the next
 180    character) of the multibyte form.  */
 181
 182 int
 183 string_char (p, advanced, len)
 184      const unsigned char *p;
 185      const unsigned char **advanced;
 186      int *len;
 187 {
 188   int c;
 189   const unsigned char *saved_p = p;
 190
 191   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 192     {
 193       c = STRING_CHAR_ADVANCE (p);
 194     }
 195   else if (! (*p & 0x08))
 196     {
 197       c = ((((p)[0] & 0xF) << 18)
 198            | (((p)[1] & 0x3F) << 12)
 199            | (((p)[2] & 0x3F) << 6)
 200            | ((p)[3] & 0x3F));
 201       p += 4;
 202     }
 203   else
 204     {
 205       c = ((((p)[1] & 0x3F) << 18)
 206            | (((p)[2] & 0x3F) << 12)
 207            | (((p)[3] & 0x3F) << 6)
 208            | ((p)[4] & 0x3F));
 209       p += 5;
 210     }
 211
 212   MAYBE_UNIFY_CHAR (c);
 213
 214   if (len)
 215     *len = p - saved_p;
 216   if (advanced)
 217     *advanced = p;
 218   return c;
 219 }
 220
 221
 222 /* Translate character C by translation table TABLE.  If C is
 223    negative, translate a character specified by CHARSET and CODE.  If
 224    no translation is found in TABLE, return the untranslated
 225    character.  If TABLE is a list, elements are char tables.  In this
 226    case, translace C by all tables.  */
 227
 228 int
 229 translate_char (table, c)
 230      Lisp_Object table;
 231      int c;
 232 {
 233   if (CHAR_TABLE_P (table))
 234     {
 235       Lisp_Object ch;
 236
 237       ch = CHAR_TABLE_REF (table, c);
 238       if (CHARACTERP (ch))
 239         c = XINT (ch);
 240     }
 241   else
 242     {
 243       for (; CONSP (table); table = XCDR (table))
 244         c = translate_char (XCAR (table), c);
 245     }
 246   return c;
 247 }
 248
 249 /* Convert the multibyte character C to unibyte 8-bit character based
 250    on the current value of charset_unibyte.  If dimension of
 251    charset_unibyte is more than one, return (C & 0xFF).
 252
 253    The argument REV_TBL is now ignored.  It will be removed in the
 254    future.  */
 255
 256 int
 257 multibyte_char_to_unibyte (c, rev_tbl)
 258      int c;
 259      Lisp_Object rev_tbl;
 260 {
 261   struct charset *charset;
 262   unsigned c1;
 263
 264   if (CHAR_BYTE8_P (c))
 265     return CHAR_TO_BYTE8 (c);
 266   charset = CHARSET_FROM_ID (charset_unibyte);
 267   c1 = ENCODE_CHAR (charset, c);
 268   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 269 }
 270
 271
 272 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 273        doc: /* Return non-nil if OBJECT is a character.  */)
 274      (object, ignore)
 275      Lisp_Object object, ignore;
 276 {
 277   return (CHARACTERP (object) ? Qt : Qnil);
 278 }
 279
 280 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 281        doc: /* Return the character of the maximum code.  */)
 282      ()
 283 {
 284   return make_number (MAX_CHAR);
 285 }
 286
 287 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 288        Sunibyte_char_to_multibyte, 1, 1, 0,
 289        doc: /* Convert the unibyte character CH to multibyte character.
 290 The multibyte character is a result of decoding CH by
 291 the current unibyte charset (see `unibyte-charset').  */)
 292      (ch)
 293      Lisp_Object ch;
 294 {
 295   int c;
 296   struct charset *charset;
 297
 298   CHECK_CHARACTER (ch);
 299   c = XFASTINT (ch);
 300   if (c >= 0400)
 301     error ("Invalid unibyte character: %d", c);
 302   charset = CHARSET_FROM_ID (charset_unibyte);
 303   c = DECODE_CHAR (charset, c);
 304   if (c < 0)
 305     c = BYTE8_TO_CHAR (XFASTINT (ch));
 306   return make_number (c);
 307 }
 308
 309 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 310        Smultibyte_char_to_unibyte, 1, 1, 0,
 311        doc: /* Convert the multibyte character CH to unibyte character.\n\
 312 The unibyte character is a result of encoding CH by
 313 the current primary charset (value of `charset-primary').  */)
 314      (ch)
 315      Lisp_Object ch;
 316 {
 317   int c;
 318
 319   CHECK_CHARACTER (ch);
 320   c = XFASTINT (ch);
 321   c = CHAR_TO_BYTE8 (c);
 322   return make_number (c);
 323 }
 324
 325 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 326        doc: /* Return 1 regardless of the argument CHAR.
 327 This is now an obsolete function.  We keep it just for backward compatibility.   */)
 328      (ch)
 329      Lisp_Object ch;
 330 {
 331   CHECK_CHARACTER (ch);
 332   return make_number (1);
 333 }
 334
 335 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 336        doc: /* Return width of CHAR when displayed in the current buffer.
 337 The width is measured by how many columns it occupies on the screen.
 338 Tab is taken to occupy `tab-width' columns.  */)
 339      (ch)
 340        Lisp_Object ch;
 341 {
 342   Lisp_Object disp;
 343   int c, width;
 344   struct Lisp_Char_Table *dp = buffer_display_table ();
 345
 346   CHECK_CHARACTER (ch);
 347   c = XINT (ch);
 348
 349   /* Get the way the display table would display it.  */
 350   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 351
 352   if (VECTORP (disp))
 353     width = ASIZE (disp);
 354   else
 355     width = CHAR_WIDTH (c);
 356
 357   return make_number (width);
 358 }
 359
 360 /* Return width of string STR of length LEN when displayed in the
 361    current buffer.  The width is measured by how many columns it
 362    occupies on the screen.  If PRECISION > 0, return the width of
 363    longest substring that doesn't exceed PRECISION, and set number of
 364    characters and bytes of the substring in *NCHARS and *NBYTES
 365    respectively.  */
 366
 367 int
 368 c_string_width (str, len, precision, nchars, nbytes)
 369      const unsigned char *str;
 370      int precision, *nchars, *nbytes;
 371 {
 372   int i = 0, i_byte = 0;
 373   int width = 0;
 374   struct Lisp_Char_Table *dp = buffer_display_table ();
 375
 376   while (i_byte < len)
 377     {
 378       int bytes, thiswidth;
 379       Lisp_Object val;
 380       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 381
 382       if (dp)
 383         {
 384           val = DISP_CHAR_VECTOR (dp, c);
 385           if (VECTORP (val))
 386             thiswidth = XVECTOR (val)->size;
 387           else
 388             thiswidth = CHAR_WIDTH (c);
 389         }
 390       else
 391         {
 392           thiswidth = CHAR_WIDTH (c);
 393         }
 394
 395       if (precision > 0
 396           && (width + thiswidth > precision))
 397         {
 398           *nchars = i;
 399           *nbytes = i_byte;
 400           return width;
 401         }
 402       i++;
 403       i_byte += bytes;
 404       width += thiswidth;
 405   }
 406
 407   if (precision > 0)
 408     {
 409       *nchars = i;
 410       *nbytes = i_byte;
 411     }
 412
 413   return width;
 414 }
 415
 416 /* Return width of string STR of length LEN when displayed in the
 417    current buffer.  The width is measured by how many columns it
 418    occupies on the screen.  */
 419
 420 int
 421 strwidth (str, len)
 422      unsigned char *str;
 423      int len;
 424 {
 425   return c_string_width (str, len, -1, NULL, NULL);
 426 }
 427
 428 /* Return width of Lisp string STRING when displayed in the current
 429    buffer.  The width is measured by how many columns it occupies on
 430    the screen while paying attention to compositions.  If PRECISION >
 431    0, return the width of longest substring that doesn't exceed
 432    PRECISION, and set number of characters and bytes of the substring
 433    in *NCHARS and *NBYTES respectively.  */
 434
 435 int
 436 lisp_string_width (string, precision, nchars, nbytes)
 437      Lisp_Object string;
 438      int precision, *nchars, *nbytes;
 439 {
 440   int len = SCHARS (string);
 441   unsigned char *str = SDATA (string);
 442   int i = 0, i_byte = 0;
 443   int width = 0;
 444   struct Lisp_Char_Table *dp = buffer_display_table ();
 445
 446   while (i < len)
 447     {
 448       int chars, bytes, thiswidth;
 449       Lisp_Object val;
 450       int cmp_id;
 451       EMACS_INT ignore, end;
 452
 453       if (find_composition (i, -1, &ignore, &end, &val, string)
 454           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 455               >= 0))
 456         {
 457           thiswidth = composition_table[cmp_id]->width;
 458           chars = end - i;
 459           bytes = string_char_to_byte (string, end) - i_byte;
 460         }
 461       else if (dp)
 462         {
 463           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 464
 465           chars = 1;
 466           val = DISP_CHAR_VECTOR (dp, c);
 467           if (VECTORP (val))
 468             thiswidth = XVECTOR (val)->size;
 469           else
 470             thiswidth = CHAR_WIDTH (c);
 471         }
 472       else
 473         {
 474           int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 475
 476           chars = 1;
 477           thiswidth = CHAR_WIDTH (c);
 478         }
 479
 480       if (precision > 0
 481           && (width + thiswidth > precision))
 482         {
 483           *nchars = i;
 484           *nbytes = i_byte;
 485           return width;
 486         }
 487       i += chars;
 488       i_byte += bytes;
 489       width += thiswidth;
 490   }
 491
 492   if (precision > 0)
 493     {
 494       *nchars = i;
 495       *nbytes = i_byte;
 496     }
 497
 498   return width;
 499 }
 500
 501 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 502        doc: /* Return width of STRING when displayed in the current buffer.
 503 Width is measured by how many columns it occupies on the screen.
 504 When calculating width of a multibyte character in STRING,
 505 only the base leading-code is considered; the validity of
 506 the following bytes is not checked.  Tabs in STRING are always
 507 taken to occupy `tab-width' columns.  */)
 508      (str)
 509      Lisp_Object str;
 510 {
 511   Lisp_Object val;
 512
 513   CHECK_STRING (str);
 514   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 515   return val;
 516 }
 517
 518 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 519        doc: /* Return the direction of CHAR.
 520 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
 521      (ch)
 522      Lisp_Object ch;
 523 {
 524   int c;
 525
 526   CHECK_CHARACTER (ch);
 527   c = XINT (ch);
 528   return CHAR_TABLE_REF (Vchar_direction_table, c);
 529 }
 530
 531 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
 532        doc: /* Return number of characters between BEG and END.
 533 This is now an obsolete function.  We keep it just for backward compatibility.  */)
 534      (beg, end)
 535      Lisp_Object beg, end;
 536 {
 537   int from, to;
 538
 539   CHECK_NUMBER_COERCE_MARKER (beg);
 540   CHECK_NUMBER_COERCE_MARKER (end);
 541
 542   from = min (XFASTINT (beg), XFASTINT (end));
 543   to = max (XFASTINT (beg), XFASTINT (end));
 544
 545   return make_number (to - from);
 546 }
 547
 548 /* Return the number of characters in the NBYTES bytes at PTR.
 549    This works by looking at the contents and checking for multibyte
 550    sequences while assuming that there's no invalid sequence.
 551    However, if the current buffer has enable-multibyte-characters =
 552    nil, we treat each byte as a character.  */
 553
 554 int
 555 chars_in_text (ptr, nbytes)
 556      const unsigned char *ptr;
 557      int nbytes;
 558 {
 559   /* current_buffer is null at early stages of Emacs initialization.  */
 560   if (current_buffer == 0
 561       || NILP (current_buffer->enable_multibyte_characters))
 562     return nbytes;
 563
 564   return multibyte_chars_in_text (ptr, nbytes);
 565 }
 566
 567 /* Return the number of characters in the NBYTES bytes at PTR.
 568    This works by looking at the contents and checking for multibyte
 569    sequences while assuming that there's no invalid sequence.  It
 570    ignores enable-multibyte-characters.  */
 571
 572 int
 573 multibyte_chars_in_text (ptr, nbytes)
 574      const unsigned char *ptr;
 575      int nbytes;
 576 {
 577   const unsigned char *endp = ptr + nbytes;
 578   int chars = 0;
 579
 580   while (ptr < endp)
 581     {
 582       int len = MULTIBYTE_LENGTH (ptr, endp);
 583
 584       if (len == 0)
 585         abort ();
 586       ptr += len;
 587       chars++;
 588     }
 589
 590   return chars;
 591 }
 592
 593 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 594    characters and bytes in it, and store them in *NCHARS and *NBYTES
 595    respectively.  On counting bytes, pay attention to that 8-bit
 596    characters not constructing a valid multibyte sequence are
 597    represented by 2-byte in a multibyte text.  */
 598
 599 void
 600 parse_str_as_multibyte (str, len, nchars, nbytes)
 601      const unsigned char *str;
 602      int len, *nchars, *nbytes;
 603 {
 604   const unsigned char *endp = str + len;
 605   int n, chars = 0, bytes = 0;
 606
 607   if (len >= MAX_MULTIBYTE_LENGTH)
 608     {
 609       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 610       while (str < adjusted_endp)
 611         {
 612           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 613             str += n, bytes += n;
 614           else
 615             str++, bytes += 2;
 616           chars++;
 617         }
 618     }
 619   while (str < endp)
 620     {
 621       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 622         str += n, bytes += n;
 623       else
 624         str++, bytes += 2;
 625       chars++;
 626     }
 627
 628   *nchars = chars;
 629   *nbytes = bytes;
 630   return;
 631 }
 632
 633 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 634    It actually converts only such 8-bit characters that don't contruct
 635    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 636    NCHARS is nonzero, set *NCHARS to the number of characters in the
 637    text.  It is assured that we can use LEN bytes at STR as a work
 638    area and that is enough.  Return the number of bytes of the
 639    resulting text.  */
 640
 641 int
 642 str_as_multibyte (str, len, nbytes, nchars)
 643      unsigned char *str;
 644      int len, nbytes, *nchars;
 645 {
 646   unsigned char *p = str, *endp = str + nbytes;
 647   unsigned char *to;
 648   int chars = 0;
 649   int n;
 650
 651   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 652     {
 653       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 654       while (p < adjusted_endp
 655              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 656         p += n, chars++;
 657     }
 658   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 659     p += n, chars++;
 660   if (nchars)
 661     *nchars = chars;
 662   if (p == endp)
 663     return nbytes;
 664
 665   to = p;
 666   nbytes = endp - p;
 667   endp = str + len;
 668   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 669   p = endp - nbytes;
 670
 671   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 672     {
 673       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 674       while (p < adjusted_endp)
 675         {
 676           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 677             {
 678               while (n--)
 679                 *to++ = *p++;
 680             }
 681           else
 682             {
 683               int c = *p++;
 684               c = BYTE8_TO_CHAR (c);
 685               to += CHAR_STRING (c, to);
 686             }
 687         }
 688       chars++;
 689     }
 690   while (p < endp)
 691     {
 692       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 693         {
 694           while (n--)
 695             *to++ = *p++;
 696         }
 697       else
 698         {
 699           int c = *p++;
 700           c = BYTE8_TO_CHAR (c);
 701           to += CHAR_STRING (c, to);
 702         }
 703       chars++;
 704     }
 705   if (nchars)
 706     *nchars = chars;
 707   return (to - str);
 708 }
 709
 710 /* Parse unibyte string at STR of LEN bytes, and return the number of
 711    bytes it may ocupy when converted to multibyte string by
 712    `str_to_multibyte'.  */
 713
 714 int
 715 parse_str_to_multibyte (str, len)
 716      unsigned char *str;
 717      int len;
 718 {
 719   unsigned char *endp = str + len;
 720   int bytes;
 721
 722   for (bytes = 0; str < endp; str++)
 723     bytes += (*str < 0x80) ? 1 : 2;
 724   return bytes;
 725 }
 726
 727
 728 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 729    that contains the same single-byte characters.  It actually
 730    converts all 8-bit characters to multibyte forms.  It is assured
 731    that we can use LEN bytes at STR as a work area and that is
 732    enough.  */
 733
 734 int
 735 str_to_multibyte (str, len, bytes)
 736      unsigned char *str;
 737      int len, bytes;
 738 {
 739   unsigned char *p = str, *endp = str + bytes;
 740   unsigned char *to;
 741
 742   while (p < endp && *p < 0x80) p++;
 743   if (p == endp)
 744     return bytes;
 745   to = p;
 746   bytes = endp - p;
 747   endp = str + len;
 748   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 749   p = endp - bytes;
 750   while (p < endp)
 751     {
 752       int c = *p++;
 753
 754       if (c >= 0x80)
 755         c = BYTE8_TO_CHAR (c);
 756       to += CHAR_STRING (c, to);
 757     }
 758   return (to - str);
 759 }
 760
 761 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 762    actually converts characters in the range 0x80..0xFF to
 763    unibyte.  */
 764
 765 int
 766 str_as_unibyte (str, bytes)
 767      unsigned char *str;
 768      int bytes;
 769 {
 770   const unsigned char *p = str, *endp = str + bytes;
 771   unsigned char *to;
 772   int c, len;
 773
 774   while (p < endp)
 775     {
 776       c = *p;
 777       len = BYTES_BY_CHAR_HEAD (c);
 778       if (CHAR_BYTE8_HEAD_P (c))
 779         break;
 780       p += len;
 781     }
 782   to = str + (p - str);
 783   while (p < endp)
 784     {
 785       c = *p;
 786       len = BYTES_BY_CHAR_HEAD (c);
 787       if (CHAR_BYTE8_HEAD_P (c))
 788         {
 789           c = STRING_CHAR_ADVANCE (p);
 790           *to++ = CHAR_TO_BYTE8 (c);
 791         }
 792       else
 793         {
 794           while (len--) *to++ = *p++;
 795         }
 796     }
 797   return (to - str);
 798 }
 799
 800 int
 801 string_count_byte8 (string)
 802      Lisp_Object string;
 803 {
 804   int multibyte = STRING_MULTIBYTE (string);
 805   int nbytes = SBYTES (string);
 806   unsigned char *p = SDATA (string);
 807   unsigned char *pend = p + nbytes;
 808   int count = 0;
 809   int c, len;
 810
 811   if (multibyte)
 812     while (p < pend)
 813       {
 814         c = *p;
 815         len = BYTES_BY_CHAR_HEAD (c);
 816
 817         if (CHAR_BYTE8_HEAD_P (c))
 818           count++;
 819         p += len;
 820       }
 821   else
 822     while (p < pend)
 823       {
 824         if (*p++ >= 0x80)
 825           count++;
 826       }
 827   return count;
 828 }
 829
 830
 831 Lisp_Object
 832 string_escape_byte8 (string)
 833      Lisp_Object string;
 834 {
 835   int nchars = SCHARS (string);
 836   int nbytes = SBYTES (string);
 837   int multibyte = STRING_MULTIBYTE (string);
 838   int byte8_count;
 839   const unsigned char *src, *src_end;
 840   unsigned char *dst;
 841   Lisp_Object val;
 842   int c, len;
 843
 844   if (multibyte && nchars == nbytes)
 845     return string;
 846
 847   byte8_count = string_count_byte8 (string);
 848
 849   if (byte8_count == 0)
 850     return string;
 851
 852   if (multibyte)
 853     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 854     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 855                                         nbytes + byte8_count * 2);
 856   else
 857     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 858     val = make_uninit_string (nbytes + byte8_count * 3);
 859
 860   src = SDATA (string);
 861   src_end = src + nbytes;
 862   dst = SDATA (val);
 863   if (multibyte)
 864     while (src < src_end)
 865       {
 866         c = *src;
 867         len = BYTES_BY_CHAR_HEAD (c);
 868
 869         if (CHAR_BYTE8_HEAD_P (c))
 870           {
 871             c = STRING_CHAR_ADVANCE (src);
 872             c = CHAR_TO_BYTE8 (c);
 873             sprintf ((char *) dst, "\\%03o", c);
 874             dst += 4;
 875           }
 876         else
 877           while (len--) *dst++ = *src++;
 878       }
 879   else
 880     while (src < src_end)
 881       {
 882         c = *src++;
 883         if (c >= 0x80)
 884           {
 885             sprintf ((char *) dst, "\\%03o", c);
 886             dst += 4;
 887           }
 888         else
 889           *dst++ = c;
 890       }
 891   return val;
 892 }
 893
 894 \f
 895 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 896        doc: /*
 897 Concatenate all the argument characters and make the result a string.
 898 usage: (string &rest CHARACTERS)  */)
 899      (n, args)
 900      int n;
 901      Lisp_Object *args;
 902 {
 903   int i;
 904   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 905   unsigned char *p = buf;
 906   int c;
 907
 908   for (i = 0; i < n; i++)
 909     {
 910       CHECK_CHARACTER (args[i]);
 911       c = XINT (args[i]);
 912       p += CHAR_STRING (c, p);
 913     }
 914
 915   return make_string_from_bytes ((char *) buf, n, p - buf);
 916 }
 917
 918 void
 919 init_character_once ()
 920 {
 921 }
 922
 923 #ifdef emacs
 924
 925 void
 926 syms_of_character ()
 927 {
 928   DEFSYM (Qcharacterp, "characterp");
 929   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 930
 931   staticpro (&Vchar_unify_table);
 932   Vchar_unify_table = Qnil;
 933
 934   defsubr (&Smax_char);
 935   defsubr (&Scharacterp);
 936   defsubr (&Sunibyte_char_to_multibyte);
 937   defsubr (&Smultibyte_char_to_unibyte);
 938   defsubr (&Schar_bytes);
 939   defsubr (&Schar_width);
 940   defsubr (&Sstring_width);
 941   defsubr (&Schar_direction);
 942   defsubr (&Schars_in_region);
 943   defsubr (&Sstring);
 944
 945   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 946                doc: /*
 947 Vector recording all translation tables ever defined.
 948 Each element is a pair (SYMBOL . TABLE) relating the table to the
 949 symbol naming it.  The ID of a translation table is an index into this vector.  */);
 950   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
 951
 952   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
 953                doc: /*
 954 A char-table for characters which invoke auto-filling.
 955 Such characters have value t in this table.  */);
 956   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
 957   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
 958   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
 959
 960   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
 961                doc: /*
 962 A char-table for width (columns) of each character.  */);
 963   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
 964   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
 965   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
 966                         make_number (4));
 967
 968   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
 969                doc: /* A char-table for direction of each character.  */);
 970   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
 971
 972   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
 973                doc: /* A char-table for each printable character.  */);
 974   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
 975   Fset_char_table_range (Vprintable_chars,
 976                          Fcons (make_number (32), make_number (126)), Qt);
 977   Fset_char_table_range (Vprintable_chars,
 978                          Fcons (make_number (160),
 979                                 make_number (MAX_5_BYTE_CHAR)), Qt);
 980
 981   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
 982                doc: /* Char table of script symbols.
 983 It has one extra slot whose value is a list of script symbols.  */);
 984
 985   /* Intern this now in case it isn't already done.
 986      Setting this variable twice is harmless.
 987      But don't staticpro it here--that is done in alloc.c.  */
 988   Qchar_table_extra_slots = intern ("char-table-extra-slots");
 989   DEFSYM (Qchar_script_table, "char-script-table");
 990   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
 991   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
 992 }
 993
 994 #endif /* emacs */
 995
 996 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
 997    (do not change this comment) */