code.delx.au - gnu-emacs/blob - src/character.c

   1 /* Basic character support.
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
   3      2010, 2011, 2012  Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5      Licensed to the Free Software Foundation.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of translation table ever defined.
  54    ID of a translation table is used to index this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
  60 Lisp_Object Qauto_fill_chars;
  61
  62 /* Char-table of information about which character to unify to which
  63    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  64 Lisp_Object Vchar_unify_table;
  65
  66 /* A char-table.  An element is non-nil iff the corresponding
  67    character has a printable glyph.  */
  68 Lisp_Object Vprintable_chars;
  69
  70 /* A char-table.  An elemnent is a column-width of the corresponding
  71    character.  */
  72 Lisp_Object Vchar_width_table;
  73
  74 /* A char-table.  An element is a symbol indicating the direction
  75    property of corresponding character.  */
  76 Lisp_Object Vchar_direction_table;
  77
  78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  79 unsigned char *_fetch_multibyte_char_p;
  80
  81 /* Char table of scripts.  */
  82 Lisp_Object Vchar_script_table;
  83
  84 /* Alist of scripts vs representative characters.  */
  85 Lisp_Object Vscript_representative_chars;
  86
  87 static Lisp_Object Qchar_script_table;
  88
  89 Lisp_Object Vunicode_category_table;
  90 \f
  91
  92 /* If character code C has modifier masks, reflect them to the
  93    character code if possible.  Return the resulting code.  */
  94
  95 int
  96 char_resolve_modifier_mask (c)
  97      int c;
  98 {
  99   /* A non-ASCII character can't reflect modifier bits to the code.  */
 100   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 101     return c;
 102
 103   /* For Meta, Shift, and Control modifiers, we need special care.  */
 104   if (c & CHAR_SHIFT)
 105     {
 106       /* Shift modifier is valid only with [A-Za-z].  */
 107       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 108         c &= ~CHAR_SHIFT;
 109       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 110         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 111       /* Shift modifier for control characters and SPC is ignored.  */
 112       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 113         c &= ~CHAR_SHIFT;
 114     }
 115   if (c & CHAR_CTL)
 116     {
 117       /* Simulate the code in lread.c.  */
 118       /* Allow `\C- ' and `\C-?'.  */
 119       if ((c & 0377) == ' ')
 120         c &= ~0177 & ~ CHAR_CTL;
 121       else if ((c & 0377) == '?')
 122         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 123       /* ASCII control chars are made from letters (both cases),
 124          as well as the non-letters within 0100...0137.  */
 125       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 126         c &= (037 | (~0177 & ~CHAR_CTL));
 127       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 128         c &= (037 | (~0177 & ~CHAR_CTL));
 129     }
 130 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 131   if (c & CHAR_META)
 132     {
 133       /* Move the meta bit to the right place for a string.  */
 134       c = (c & ~CHAR_META) | 0x80;
 135     }
 136 #endif
 137
 138   return c;
 139 }
 140
 141
 142 /* Store multibyte form of character C at P.  If C has modifier bits,
 143    handle them appropriately.  */
 144
 145 int
 146 char_string (c, p)
 147      unsigned c;
 148      unsigned char *p;
 149 {
 150   int bytes;
 151
 152   if (c & CHAR_MODIFIER_MASK)
 153     {
 154       c = (unsigned) char_resolve_modifier_mask ((int) c);
 155       /* If C still has any modifier bits, just ignore it.  */
 156       c &= ~CHAR_MODIFIER_MASK;
 157     }
 158
 159   MAYBE_UNIFY_CHAR (c);
 160
 161   if (c <= MAX_3_BYTE_CHAR)
 162     {
 163       bytes = CHAR_STRING (c, p);
 164     }
 165   else if (c <= MAX_4_BYTE_CHAR)
 166     {
 167       p[0] = (0xF0 | (c >> 18));
 168       p[1] = (0x80 | ((c >> 12) & 0x3F));
 169       p[2] = (0x80 | ((c >> 6) & 0x3F));
 170       p[3] = (0x80 | (c & 0x3F));
 171       bytes = 4;
 172     }
 173   else if (c <= MAX_5_BYTE_CHAR)
 174     {
 175       p[0] = 0xF8;
 176       p[1] = (0x80 | ((c >> 18) & 0x0F));
 177       p[2] = (0x80 | ((c >> 12) & 0x3F));
 178       p[3] = (0x80 | ((c >> 6) & 0x3F));
 179       p[4] = (0x80 | (c & 0x3F));
 180       bytes = 5;
 181     }
 182   else if (c <= MAX_CHAR)
 183     {
 184       c = CHAR_TO_BYTE8 (c);
 185       bytes = BYTE8_STRING (c, p);
 186     }
 187   else
 188     error ("Invalid character: %d", c);
 189
 190   return bytes;
 191 }
 192
 193
 194 /* Return a character whose multibyte form is at P.  Set LEN is not
 195    NULL, it must be a pointer to integer.  In that case, set *LEN to
 196    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 197    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 198    the ending address (i.e. the starting address of the next
 199    character) of the multibyte form.  */
 200
 201 int
 202 string_char (p, advanced, len)
 203      const unsigned char *p;
 204      const unsigned char **advanced;
 205      int *len;
 206 {
 207   int c;
 208   const unsigned char *saved_p = p;
 209
 210   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 211     {
 212       c = STRING_CHAR_ADVANCE (p);
 213     }
 214   else if (! (*p & 0x08))
 215     {
 216       c = ((((p)[0] & 0xF) << 18)
 217            | (((p)[1] & 0x3F) << 12)
 218            | (((p)[2] & 0x3F) << 6)
 219            | ((p)[3] & 0x3F));
 220       p += 4;
 221     }
 222   else
 223     {
 224       c = ((((p)[1] & 0x3F) << 18)
 225            | (((p)[2] & 0x3F) << 12)
 226            | (((p)[3] & 0x3F) << 6)
 227            | ((p)[4] & 0x3F));
 228       p += 5;
 229     }
 230
 231   MAYBE_UNIFY_CHAR (c);
 232
 233   if (len)
 234     *len = p - saved_p;
 235   if (advanced)
 236     *advanced = p;
 237   return c;
 238 }
 239
 240
 241 /* Translate character C by translation table TABLE.  If C is
 242    negative, translate a character specified by CHARSET and CODE.  If
 243    no translation is found in TABLE, return the untranslated
 244    character.  If TABLE is a list, elements are char tables.  In this
 245    case, translace C by all tables.  */
 246
 247 int
 248 translate_char (table, c)
 249      Lisp_Object table;
 250      int c;
 251 {
 252   if (CHAR_TABLE_P (table))
 253     {
 254       Lisp_Object ch;
 255
 256       ch = CHAR_TABLE_REF (table, c);
 257       if (CHARACTERP (ch))
 258         c = XINT (ch);
 259     }
 260   else
 261     {
 262       for (; CONSP (table); table = XCDR (table))
 263         c = translate_char (XCAR (table), c);
 264     }
 265   return c;
 266 }
 267
 268 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 269    them, return (C & 0xFF).
 270
 271    The argument REV_TBL is now ignored.  It will be removed in the
 272    future.  */
 273
 274 int
 275 multibyte_char_to_unibyte (c, rev_tbl)
 276      int c;
 277      Lisp_Object rev_tbl;
 278 {
 279   if (c < 0x80)
 280     return c;
 281   if (CHAR_BYTE8_P (c))
 282     return CHAR_TO_BYTE8 (c);
 283   return (c & 0xFF);
 284 }
 285
 286 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 287    by charset_unibyte.  */
 288
 289 int
 290 multibyte_char_to_unibyte_safe (c)
 291      int c;
 292 {
 293   if (c < 0x80)
 294     return c;
 295   if (CHAR_BYTE8_P (c))
 296     return CHAR_TO_BYTE8 (c);
 297   return -1;
 298 }
 299
 300 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 301        doc: /* Return non-nil if OBJECT is a character.  */)
 302      (object, ignore)
 303      Lisp_Object object, ignore;
 304 {
 305   return (CHARACTERP (object) ? Qt : Qnil);
 306 }
 307
 308 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 309        doc: /* Return the character of the maximum code.  */)
 310      ()
 311 {
 312   return make_number (MAX_CHAR);
 313 }
 314
 315 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 316        Sunibyte_char_to_multibyte, 1, 1, 0,
 317        doc: /* Convert the byte CH to multibyte character.  */)
 318      (ch)
 319      Lisp_Object ch;
 320 {
 321   int c;
 322
 323   CHECK_CHARACTER (ch);
 324   c = XFASTINT (ch);
 325   if (c >= 0x100)
 326     error ("Not a unibyte character: %d", c);
 327   MAKE_CHAR_MULTIBYTE (c);
 328   return make_number (c);
 329 }
 330
 331 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 332        Smultibyte_char_to_unibyte, 1, 1, 0,
 333        doc: /* Convert the multibyte character CH to a byte.
 334 If the multibyte character does not represent a byte, return -1.  */)
 335      (ch)
 336      Lisp_Object ch;
 337 {
 338   int cm;
 339
 340   CHECK_CHARACTER (ch);
 341   cm = XFASTINT (ch);
 342   if (cm < 256)
 343     /* Can't distinguish a byte read from a unibyte buffer from
 344        a latin1 char, so let's let it slide.  */
 345     return ch;
 346   else
 347     {
 348       int cu = CHAR_TO_BYTE_SAFE (cm);
 349       return make_number (cu);
 350     }
 351 }
 352
 353 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 354        doc: /* Return 1 regardless of the argument CHAR.
 355 This is now an obsolete function.  We keep it just for backward compatibility.
 356 usage: (char-bytes CHAR)  */)
 357      (ch)
 358      Lisp_Object ch;
 359 {
 360   CHECK_CHARACTER (ch);
 361   return make_number (1);
 362 }
 363
 364
 365 /* Return width (columns) of C considering the buffer display table DP. */
 366
 367 static int
 368 char_width (int c, struct Lisp_Char_Table *dp)
 369 {
 370   int width = CHAR_WIDTH (c);
 371
 372   if (dp)
 373     {
 374       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 375       int i;
 376
 377       if (VECTORP (disp))
 378         for (i = 0, width = 0; i < ASIZE (disp); i++)
 379           {
 380             ch = AREF (disp, i);
 381             if (CHARACTERP (ch))
 382               width += CHAR_WIDTH (XFASTINT (ch));
 383           }
 384     }
 385   return width;
 386 }
 387
 388
 389 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 390        doc: /* Return width of CHAR when displayed in the current buffer.
 391 The width is measured by how many columns it occupies on the screen.
 392 Tab is taken to occupy `tab-width' columns.
 393 usage: (char-width CHAR)  */)
 394      (ch)
 395        Lisp_Object ch;
 396 {
 397   int c, width;
 398
 399   CHECK_CHARACTER (ch);
 400   c = XINT (ch);
 401
 402   width = char_width (c, buffer_display_table ());
 403   return make_number (width);
 404 }
 405
 406 /* Return width of string STR of length LEN when displayed in the
 407    current buffer.  The width is measured by how many columns it
 408    occupies on the screen.  If PRECISION > 0, return the width of
 409    longest substring that doesn't exceed PRECISION, and set number of
 410    characters and bytes of the substring in *NCHARS and *NBYTES
 411    respectively.  */
 412
 413 int
 414 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 415 {
 416   int i = 0, i_byte = 0;
 417   int width = 0;
 418   struct Lisp_Char_Table *dp = buffer_display_table ();
 419
 420   while (i_byte < len)
 421     {
 422       int bytes;
 423       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 424       int thiswidth = char_width (c, dp);
 425
 426       if (precision > 0
 427           && (width + thiswidth > precision))
 428         {
 429           *nchars = i;
 430           *nbytes = i_byte;
 431           return width;
 432         }
 433       i++;
 434       i_byte += bytes;
 435       width += thiswidth;
 436   }
 437
 438   if (precision > 0)
 439     {
 440       *nchars = i;
 441       *nbytes = i_byte;
 442     }
 443
 444   return width;
 445 }
 446
 447 /* Return width of string STR of length LEN when displayed in the
 448    current buffer.  The width is measured by how many columns it
 449    occupies on the screen.  */
 450
 451 int
 452 strwidth (str, len)
 453      unsigned char *str;
 454      int len;
 455 {
 456   return c_string_width (str, len, -1, NULL, NULL);
 457 }
 458
 459 /* Return width of Lisp string STRING when displayed in the current
 460    buffer.  The width is measured by how many columns it occupies on
 461    the screen while paying attention to compositions.  If PRECISION >
 462    0, return the width of longest substring that doesn't exceed
 463    PRECISION, and set number of characters and bytes of the substring
 464    in *NCHARS and *NBYTES respectively.  */
 465
 466 int
 467 lisp_string_width (string, precision, nchars, nbytes)
 468      Lisp_Object string;
 469      int precision, *nchars, *nbytes;
 470 {
 471   int len = SCHARS (string);
 472   /* This set multibyte to 0 even if STRING is multibyte when it
 473      contains only ascii and eight-bit-graphic, but that's
 474      intentional.  */
 475   int multibyte = len < SBYTES (string);
 476   unsigned char *str = SDATA (string);
 477   int i = 0, i_byte = 0;
 478   int width = 0;
 479   struct Lisp_Char_Table *dp = buffer_display_table ();
 480
 481   while (i < len)
 482     {
 483       int chars, bytes, thiswidth;
 484       Lisp_Object val;
 485       int cmp_id;
 486       EMACS_INT ignore, end;
 487
 488       if (find_composition (i, -1, &ignore, &end, &val, string)
 489           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 490               >= 0))
 491         {
 492           thiswidth = composition_table[cmp_id]->width;
 493           chars = end - i;
 494           bytes = string_char_to_byte (string, end) - i_byte;
 495         }
 496       else
 497         {
 498           int c;
 499
 500           if (multibyte)
 501             c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 502           else
 503             c = str[i_byte], bytes = 1;
 504           chars = 1;
 505           thiswidth = char_width (c, dp);
 506         }
 507
 508       if (precision > 0
 509           && (width + thiswidth > precision))
 510         {
 511           *nchars = i;
 512           *nbytes = i_byte;
 513           return width;
 514         }
 515       i += chars;
 516       i_byte += bytes;
 517       width += thiswidth;
 518   }
 519
 520   if (precision > 0)
 521     {
 522       *nchars = i;
 523       *nbytes = i_byte;
 524     }
 525
 526   return width;
 527 }
 528
 529 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 530        doc: /* Return width of STRING when displayed in the current buffer.
 531 Width is measured by how many columns it occupies on the screen.
 532 When calculating width of a multibyte character in STRING,
 533 only the base leading-code is considered; the validity of
 534 the following bytes is not checked.  Tabs in STRING are always
 535 taken to occupy `tab-width' columns.
 536 usage: (string-width STRING)  */)
 537      (str)
 538      Lisp_Object str;
 539 {
 540   Lisp_Object val;
 541
 542   CHECK_STRING (str);
 543   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 544   return val;
 545 }
 546
 547 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 548        doc: /* Return the direction of CHAR.
 549 The returned value is 0 for left-to-right and 1 for right-to-left.
 550 usage: (char-direction CHAR)  */)
 551      (ch)
 552      Lisp_Object ch;
 553 {
 554   int c;
 555
 556   CHECK_CHARACTER (ch);
 557   c = XINT (ch);
 558   return CHAR_TABLE_REF (Vchar_direction_table, c);
 559 }
 560
 561 /* Return the number of characters in the NBYTES bytes at PTR.
 562    This works by looking at the contents and checking for multibyte
 563    sequences while assuming that there's no invalid sequence.
 564    However, if the current buffer has enable-multibyte-characters =
 565    nil, we treat each byte as a character.  */
 566
 567 EMACS_INT
 568 chars_in_text (ptr, nbytes)
 569      const unsigned char *ptr;
 570      EMACS_INT nbytes;
 571 {
 572   /* current_buffer is null at early stages of Emacs initialization.  */
 573   if (current_buffer == 0
 574       || NILP (current_buffer->enable_multibyte_characters))
 575     return nbytes;
 576
 577   return multibyte_chars_in_text (ptr, nbytes);
 578 }
 579
 580 /* Return the number of characters in the NBYTES bytes at PTR.
 581    This works by looking at the contents and checking for multibyte
 582    sequences while assuming that there's no invalid sequence.  It
 583    ignores enable-multibyte-characters.  */
 584
 585 EMACS_INT
 586 multibyte_chars_in_text (ptr, nbytes)
 587      const unsigned char *ptr;
 588      EMACS_INT nbytes;
 589 {
 590   const unsigned char *endp = ptr + nbytes;
 591   int chars = 0;
 592
 593   while (ptr < endp)
 594     {
 595       int len = MULTIBYTE_LENGTH (ptr, endp);
 596
 597       if (len == 0)
 598         abort ();
 599       ptr += len;
 600       chars++;
 601     }
 602
 603   return chars;
 604 }
 605
 606 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 607    characters and bytes in it, and store them in *NCHARS and *NBYTES
 608    respectively.  On counting bytes, pay attention to that 8-bit
 609    characters not constructing a valid multibyte sequence are
 610    represented by 2-byte in a multibyte text.  */
 611
 612 void
 613 parse_str_as_multibyte (str, len, nchars, nbytes)
 614      const unsigned char *str;
 615      int len, *nchars, *nbytes;
 616 {
 617   const unsigned char *endp = str + len;
 618   int n, chars = 0, bytes = 0;
 619
 620   if (len >= MAX_MULTIBYTE_LENGTH)
 621     {
 622       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 623       while (str < adjusted_endp)
 624         {
 625           if (! CHAR_BYTE8_HEAD_P (*str)
 626               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 627             str += n, bytes += n;
 628           else
 629             str++, bytes += 2;
 630           chars++;
 631         }
 632     }
 633   while (str < endp)
 634     {
 635       if (! CHAR_BYTE8_HEAD_P (*str)
 636           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 637         str += n, bytes += n;
 638       else
 639         str++, bytes += 2;
 640       chars++;
 641     }
 642
 643   *nchars = chars;
 644   *nbytes = bytes;
 645   return;
 646 }
 647
 648 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 649    It actually converts only such 8-bit characters that don't contruct
 650    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 651    NCHARS is nonzero, set *NCHARS to the number of characters in the
 652    text.  It is assured that we can use LEN bytes at STR as a work
 653    area and that is enough.  Return the number of bytes of the
 654    resulting text.  */
 655
 656 int
 657 str_as_multibyte (str, len, nbytes, nchars)
 658      unsigned char *str;
 659      int len, nbytes, *nchars;
 660 {
 661   unsigned char *p = str, *endp = str + nbytes;
 662   unsigned char *to;
 663   int chars = 0;
 664   int n;
 665
 666   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 667     {
 668       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 669       while (p < adjusted_endp
 670              && ! CHAR_BYTE8_HEAD_P (*p)
 671              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 672         p += n, chars++;
 673     }
 674   while (p < endp
 675          && ! CHAR_BYTE8_HEAD_P (*p)
 676          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 677     p += n, chars++;
 678   if (nchars)
 679     *nchars = chars;
 680   if (p == endp)
 681     return nbytes;
 682
 683   to = p;
 684   nbytes = endp - p;
 685   endp = str + len;
 686   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 687   p = endp - nbytes;
 688
 689   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 690     {
 691       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 692       while (p < adjusted_endp)
 693         {
 694           if (! CHAR_BYTE8_HEAD_P (*p)
 695               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 696             {
 697               while (n--)
 698                 *to++ = *p++;
 699             }
 700           else
 701             {
 702               int c = *p++;
 703               c = BYTE8_TO_CHAR (c);
 704               to += CHAR_STRING (c, to);
 705             }
 706         }
 707       chars++;
 708     }
 709   while (p < endp)
 710     {
 711       if (! CHAR_BYTE8_HEAD_P (*p)
 712           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 713         {
 714           while (n--)
 715             *to++ = *p++;
 716         }
 717       else
 718         {
 719           int c = *p++;
 720           c = BYTE8_TO_CHAR (c);
 721           to += CHAR_STRING (c, to);
 722         }
 723       chars++;
 724     }
 725   if (nchars)
 726     *nchars = chars;
 727   return (to - str);
 728 }
 729
 730 /* Parse unibyte string at STR of LEN bytes, and return the number of
 731    bytes it may ocupy when converted to multibyte string by
 732    `str_to_multibyte'.  */
 733
 734 int
 735 parse_str_to_multibyte (str, len)
 736      unsigned char *str;
 737      int len;
 738 {
 739   unsigned char *endp = str + len;
 740   int bytes;
 741
 742   for (bytes = 0; str < endp; str++)
 743     bytes += (*str < 0x80) ? 1 : 2;
 744   return bytes;
 745 }
 746
 747
 748 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 749    that contains the same single-byte characters.  It actually
 750    converts all 8-bit characters to multibyte forms.  It is assured
 751    that we can use LEN bytes at STR as a work area and that is
 752    enough.  */
 753
 754 int
 755 str_to_multibyte (str, len, bytes)
 756      unsigned char *str;
 757      int len, bytes;
 758 {
 759   unsigned char *p = str, *endp = str + bytes;
 760   unsigned char *to;
 761
 762   while (p < endp && *p < 0x80) p++;
 763   if (p == endp)
 764     return bytes;
 765   to = p;
 766   bytes = endp - p;
 767   endp = str + len;
 768   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 769   p = endp - bytes;
 770   while (p < endp)
 771     {
 772       int c = *p++;
 773
 774       if (c >= 0x80)
 775         c = BYTE8_TO_CHAR (c);
 776       to += CHAR_STRING (c, to);
 777     }
 778   return (to - str);
 779 }
 780
 781 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 782    actually converts characters in the range 0x80..0xFF to
 783    unibyte.  */
 784
 785 int
 786 str_as_unibyte (str, bytes)
 787      unsigned char *str;
 788      int bytes;
 789 {
 790   const unsigned char *p = str, *endp = str + bytes;
 791   unsigned char *to;
 792   int c, len;
 793
 794   while (p < endp)
 795     {
 796       c = *p;
 797       len = BYTES_BY_CHAR_HEAD (c);
 798       if (CHAR_BYTE8_HEAD_P (c))
 799         break;
 800       p += len;
 801     }
 802   to = str + (p - str);
 803   while (p < endp)
 804     {
 805       c = *p;
 806       len = BYTES_BY_CHAR_HEAD (c);
 807       if (CHAR_BYTE8_HEAD_P (c))
 808         {
 809           c = STRING_CHAR_ADVANCE (p);
 810           *to++ = CHAR_TO_BYTE8 (c);
 811         }
 812       else
 813         {
 814           while (len--) *to++ = *p++;
 815         }
 816     }
 817   return (to - str);
 818 }
 819
 820 /* Convert eight-bit chars in SRC (in multibyte form) to the
 821    corresponding byte and store in DST.  CHARS is the number of
 822    characters in SRC.  The value is the number of bytes stored in DST.
 823    Usually, the value is the same as CHARS, but is less than it if SRC
 824    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 825    is nonzero, a Latin-1 character is accepted and converted to a byte
 826    of that character code.
 827    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 828
 829 EMACS_INT
 830 str_to_unibyte (src, dst, chars, accept_latin_1)
 831      const unsigned char *src;
 832      unsigned char *dst;
 833      EMACS_INT chars;
 834      int accept_latin_1;
 835 {
 836   EMACS_INT i;
 837
 838   for (i = 0; i < chars; i++)
 839     {
 840       int c = STRING_CHAR_ADVANCE (src);
 841
 842       if (CHAR_BYTE8_P (c))
 843         c = CHAR_TO_BYTE8 (c);
 844       else if (! ASCII_CHAR_P (c)
 845                && (! accept_latin_1 || c >= 0x100))
 846         return i;
 847       *dst++ = c;
 848     }
 849   return i;
 850 }
 851
 852
 853 int
 854 string_count_byte8 (string)
 855      Lisp_Object string;
 856 {
 857   int multibyte = STRING_MULTIBYTE (string);
 858   int nbytes = SBYTES (string);
 859   unsigned char *p = SDATA (string);
 860   unsigned char *pend = p + nbytes;
 861   int count = 0;
 862   int c, len;
 863
 864   if (multibyte)
 865     while (p < pend)
 866       {
 867         c = *p;
 868         len = BYTES_BY_CHAR_HEAD (c);
 869
 870         if (CHAR_BYTE8_HEAD_P (c))
 871           count++;
 872         p += len;
 873       }
 874   else
 875     while (p < pend)
 876       {
 877         if (*p++ >= 0x80)
 878           count++;
 879       }
 880   return count;
 881 }
 882
 883
 884 Lisp_Object
 885 string_escape_byte8 (string)
 886      Lisp_Object string;
 887 {
 888   int nchars = SCHARS (string);
 889   int nbytes = SBYTES (string);
 890   int multibyte = STRING_MULTIBYTE (string);
 891   int byte8_count;
 892   const unsigned char *src, *src_end;
 893   unsigned char *dst;
 894   Lisp_Object val;
 895   int c, len;
 896
 897   if (multibyte && nchars == nbytes)
 898     return string;
 899
 900   byte8_count = string_count_byte8 (string);
 901
 902   if (byte8_count == 0)
 903     return string;
 904
 905   if (multibyte)
 906     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 907     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 908                                         nbytes + byte8_count * 2);
 909   else
 910     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 911     val = make_uninit_string (nbytes + byte8_count * 3);
 912
 913   src = SDATA (string);
 914   src_end = src + nbytes;
 915   dst = SDATA (val);
 916   if (multibyte)
 917     while (src < src_end)
 918       {
 919         c = *src;
 920         len = BYTES_BY_CHAR_HEAD (c);
 921
 922         if (CHAR_BYTE8_HEAD_P (c))
 923           {
 924             c = STRING_CHAR_ADVANCE (src);
 925             c = CHAR_TO_BYTE8 (c);
 926             sprintf ((char *) dst, "\\%03o", c);
 927             dst += 4;
 928           }
 929         else
 930           while (len--) *dst++ = *src++;
 931       }
 932   else
 933     while (src < src_end)
 934       {
 935         c = *src++;
 936         if (c >= 0x80)
 937           {
 938             sprintf ((char *) dst, "\\%03o", c);
 939             dst += 4;
 940           }
 941         else
 942           *dst++ = c;
 943       }
 944   return val;
 945 }
 946
 947 \f
 948 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 949        doc: /*
 950 Concatenate all the argument characters and make the result a string.
 951 usage: (string &rest CHARACTERS)  */)
 952      (n, args)
 953      int n;
 954      Lisp_Object *args;
 955 {
 956   int i, c;
 957   unsigned char *buf, *p;
 958   Lisp_Object str;
 959   USE_SAFE_ALLOCA;
 960
 961   SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
 962   p = buf;
 963
 964   for (i = 0; i < n; i++)
 965     {
 966       CHECK_CHARACTER (args[i]);
 967       c = XINT (args[i]);
 968       p += CHAR_STRING (c, p);
 969     }
 970
 971   str = make_string_from_bytes ((char *) buf, n, p - buf);
 972   SAFE_FREE ();
 973   return str;
 974 }
 975
 976 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 977        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 978 usage: (unibyte-string &rest BYTES)  */)
 979      (n, args)
 980      int n;
 981      Lisp_Object *args;
 982 {
 983   int i, c;
 984   unsigned char *buf, *p;
 985   Lisp_Object str;
 986   USE_SAFE_ALLOCA;
 987
 988   SAFE_ALLOCA (buf, unsigned char *, n);
 989   p = buf;
 990
 991   for (i = 0; i < n; i++)
 992     {
 993       CHECK_NATNUM (args[i]);
 994       c = XFASTINT (args[i]);
 995       if (c >= 256)
 996         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 997       *p++ = c;
 998     }
 999
1000   str = make_string_from_bytes ((char *) buf, n, p - buf);
1001   SAFE_FREE ();
1002   return str;
1003 }
1004
1005 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
1006        Schar_resolve_modifiers, 1, 1, 0,
1007        doc: /* Resolve modifiers in the character CHAR.
1008 The value is a character with modifiers resolved into the character
1009 code.  Unresolved modifiers are kept in the value.
1010 usage: (char-resolve-modifiers CHAR)  */)
1011      (character)
1012      Lisp_Object character;
1013 {
1014   int c;
1015
1016   CHECK_NUMBER (character);
1017   c = XINT (character);
1018   return make_number (char_resolve_modifier_mask (c));
1019 }
1020
1021 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1022        doc: /* Return a byte value of a character at point.
1023 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1024 a byte value.
1025 Optional 2nd arg STRING, if non-nil, is a string of which first
1026 character is a target to get a byte value.  In this case, POSITION, if
1027 non-nil, is an index of a target character in the string.
1028
1029 If the current buffer (or STRING) is multibyte, and the target
1030 character is not ASCII nor 8-bit character, an error is signalled.  */)
1031      (position, string)
1032      Lisp_Object position, string;
1033 {
1034   int c;
1035   EMACS_INT pos;
1036   unsigned char *p;
1037
1038   if (NILP (string))
1039     {
1040       if (NILP (position))
1041         {
1042           p = PT_ADDR;
1043         }
1044       else
1045         {
1046           CHECK_NUMBER_COERCE_MARKER (position);
1047           if (XINT (position) < BEGV || XINT (position) >= ZV)
1048             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1049           pos = XFASTINT (position);
1050           p = CHAR_POS_ADDR (pos);
1051         }
1052       if (NILP (current_buffer->enable_multibyte_characters))
1053         return make_number (*p);
1054     }
1055   else
1056     {
1057       CHECK_STRING (string);
1058       if (NILP (position))
1059         {
1060           p = SDATA (string);
1061         }
1062       else
1063         {
1064           CHECK_NATNUM (position);
1065           if (XINT (position) >= SCHARS (string))
1066             args_out_of_range (string, position);
1067           pos = XFASTINT (position);
1068           p = SDATA (string) + string_char_to_byte (string, pos);
1069         }
1070       if (! STRING_MULTIBYTE (string))
1071         return make_number (*p);
1072     }
1073   c = STRING_CHAR (p);
1074   if (CHAR_BYTE8_P (c))
1075     c = CHAR_TO_BYTE8 (c);
1076   else if (! ASCII_CHAR_P (c))
1077     error ("Not an ASCII nor an 8-bit character: %d", c);
1078   return make_number (c);
1079 }
1080
1081
1082 void
1083 init_character_once ()
1084 {
1085 }
1086
1087 #ifdef emacs
1088
1089 void
1090 syms_of_character ()
1091 {
1092   DEFSYM (Qcharacterp, "characterp");
1093   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1094
1095   staticpro (&Vchar_unify_table);
1096   Vchar_unify_table = Qnil;
1097
1098   defsubr (&Smax_char);
1099   defsubr (&Scharacterp);
1100   defsubr (&Sunibyte_char_to_multibyte);
1101   defsubr (&Smultibyte_char_to_unibyte);
1102   defsubr (&Schar_bytes);
1103   defsubr (&Schar_width);
1104   defsubr (&Sstring_width);
1105   defsubr (&Schar_direction);
1106   defsubr (&Sstring);
1107   defsubr (&Sunibyte_string);
1108   defsubr (&Schar_resolve_modifiers);
1109   defsubr (&Sget_byte);
1110
1111   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1112                doc: /*
1113 Vector recording all translation tables ever defined.
1114 Each element is a pair (SYMBOL . TABLE) relating the table to the
1115 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1116   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1117
1118   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1119                doc: /*
1120 A char-table for characters which invoke auto-filling.
1121 Such characters have value t in this table.  */);
1122   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1123   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1124   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1125
1126   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1127                doc: /*
1128 A char-table for width (columns) of each character.  */);
1129   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1130   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1131   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1132                         make_number (4));
1133
1134   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1135                doc: /* A char-table for direction of each character.  */);
1136   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1137
1138   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1139                doc: /* A char-table for each printable character.  */);
1140   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1141   Fset_char_table_range (Vprintable_chars,
1142                          Fcons (make_number (32), make_number (126)), Qt);
1143   Fset_char_table_range (Vprintable_chars,
1144                          Fcons (make_number (160),
1145                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1146
1147   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1148                doc: /* Char table of script symbols.
1149 It has one extra slot whose value is a list of script symbols.  */);
1150
1151   /* Intern this now in case it isn't already done.
1152      Setting this variable twice is harmless.
1153      But don't staticpro it here--that is done in alloc.c.  */
1154   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1155   DEFSYM (Qchar_script_table, "char-script-table");
1156   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1157   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1158
1159   DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1160                doc: /* Alist of scripts vs the representative characters.
1161 Each element is a cons (SCRIPT . CHARS).
1162 SCRIPT is a symbol representing a script or a subgroup of a script.
1163 CHARS is a list or a vector of characters.
1164 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1165 If it is a vector, one of the characters in the vector is necessary.
1166 This variable is used to find a font for a specific script.  */);
1167   Vscript_representative_chars = Qnil;
1168
1169   DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1170                doc: /* Char table of Unicode's "General Category".
1171 All Unicode characters have one of the following values (symbol):
1172   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1173   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1174 See The Unicode Standard for the meaning of those values.  */);
1175   /* The correct char-table is setup in characters.el.  */
1176   Vunicode_category_table = Qnil;
1177 }
1178
1179 #endif /* emacs */
1180
1181 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1182    (do not change this comment) */