code.delx.au - gnu-emacs/blob - src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2012  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* Read the documentation in `character.h' first; it is needed to
   understand the code in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include <intprops.h>
  39 #include "lisp.h"
  40 #include "character.h"
  41 #include "buffer.h"
  42 #include "charset.h"
  43 #include "composite.h"
  44 #include "disptab.h"
  45
  46 #else  /* not emacs */
  47
  48 #include "mulelib.h"
  49
  50 #endif /* emacs */
  51
/* Lisp symbol `characterp'; set up in syms_of_character.  */
Lisp_Object Qcharacterp;

/* Lisp symbol `auto-fill-chars'; set up in syms_of_character and
   passed as the first argument of Fmake_char_table when the table
   stored in Vauto_fill_chars is created.  */
static Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.
   Initialized to nil in syms_of_character.  */
Lisp_Object Vchar_unify_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Lisp symbol `char-script-table'; set up in syms_of_character, where
   it is given a `char-table-extra-slots' property of 1 and passed as
   the first argument of Fmake_char_table for Vchar_script_table.  */
static Lisp_Object Qchar_script_table;
  64
  65 \f
  66
  67 /* If character code C has modifier masks, reflect them to the
  68    character code if possible.  Return the resulting code.  */
  69
  70 int
  71 char_resolve_modifier_mask (int c)
  72 {
  73   /* A non-ASCII character can't reflect modifier bits to the code.  */
  74   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  75     return c;
  76
  77   /* For Meta, Shift, and Control modifiers, we need special care.  */
  78   if (c & CHAR_SHIFT)
  79     {
  80       /* Shift modifier is valid only with [A-Za-z].  */
  81       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  82         c &= ~CHAR_SHIFT;
  83       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  84         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  85       /* Shift modifier for control characters and SPC is ignored.  */
  86       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  87         c &= ~CHAR_SHIFT;
  88     }
  89   if (c & CHAR_CTL)
  90     {
  91       /* Simulate the code in lread.c.  */
  92       /* Allow `\C- ' and `\C-?'.  */
  93       if ((c & 0377) == ' ')
  94         c &= ~0177 & ~ CHAR_CTL;
  95       else if ((c & 0377) == '?')
  96         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  97       /* ASCII control chars are made from letters (both cases),
  98          as well as the non-letters within 0100...0137.  */
  99       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 100         c &= (037 | (~0177 & ~CHAR_CTL));
 101       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 102         c &= (037 | (~0177 & ~CHAR_CTL));
 103     }
 104 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 105   if (c & CHAR_META)
 106     {
 107       /* Move the meta bit to the right place for a string.  */
 108       c = (c & ~CHAR_META) | 0x80;
 109     }
 110 #endif
 111
 112   return c;
 113 }
 114
 115
 116 /* Store multibyte form of character C at P.  If C has modifier bits,
 117    handle them appropriately.  */
 118
 119 int
 120 char_string (unsigned int c, unsigned char *p)
 121 {
 122   int bytes;
 123
 124   if (c & CHAR_MODIFIER_MASK)
 125     {
 126       c = char_resolve_modifier_mask (c);
 127       /* If C still has any modifier bits, just ignore it.  */
 128       c &= ~CHAR_MODIFIER_MASK;
 129     }
 130
 131   MAYBE_UNIFY_CHAR (c);
 132
 133   if (c <= MAX_3_BYTE_CHAR)
 134     {
 135       bytes = CHAR_STRING (c, p);
 136     }
 137   else if (c <= MAX_4_BYTE_CHAR)
 138     {
 139       p[0] = (0xF0 | (c >> 18));
 140       p[1] = (0x80 | ((c >> 12) & 0x3F));
 141       p[2] = (0x80 | ((c >> 6) & 0x3F));
 142       p[3] = (0x80 | (c & 0x3F));
 143       bytes = 4;
 144     }
 145   else if (c <= MAX_5_BYTE_CHAR)
 146     {
 147       p[0] = 0xF8;
 148       p[1] = (0x80 | ((c >> 18) & 0x0F));
 149       p[2] = (0x80 | ((c >> 12) & 0x3F));
 150       p[3] = (0x80 | ((c >> 6) & 0x3F));
 151       p[4] = (0x80 | (c & 0x3F));
 152       bytes = 5;
 153     }
 154   else if (c <= MAX_CHAR)
 155     {
 156       c = CHAR_TO_BYTE8 (c);
 157       bytes = BYTE8_STRING (c, p);
 158     }
 159   else
 160     error ("Invalid character: %x", c);
 161
 162   return bytes;
 163 }
 164
 165
 166 /* Return a character whose multibyte form is at P.  If LEN is not
 167    NULL, it must be a pointer to integer.  In that case, set *LEN to
 168    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 169    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 170    the ending address (i.e., the starting address of the next
 171    character) of the multibyte form.  */
 172
 173 int
 174 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 175 {
 176   int c;
 177   const unsigned char *saved_p = p;
 178
 179   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 180     {
 181       c = STRING_CHAR_ADVANCE (p);
 182     }
 183   else if (! (*p & 0x08))
 184     {
 185       c = ((((p)[0] & 0xF) << 18)
 186            | (((p)[1] & 0x3F) << 12)
 187            | (((p)[2] & 0x3F) << 6)
 188            | ((p)[3] & 0x3F));
 189       p += 4;
 190     }
 191   else
 192     {
 193       c = ((((p)[1] & 0x3F) << 18)
 194            | (((p)[2] & 0x3F) << 12)
 195            | (((p)[3] & 0x3F) << 6)
 196            | ((p)[4] & 0x3F));
 197       p += 5;
 198     }
 199
 200   MAYBE_UNIFY_CHAR (c);
 201
 202   if (len)
 203     *len = p - saved_p;
 204   if (advanced)
 205     *advanced = p;
 206   return c;
 207 }
 208
 209
 210 /* Translate character C by translation table TABLE.  If no translation is
 211    found in TABLE, return the untranslated character.  If TABLE is a list,
 212    elements are char tables.  In that case, recursively translate C by all the
 213    tables in the list.  */
 214
 215 int
 216 translate_char (Lisp_Object table, int c)
 217 {
 218   if (CHAR_TABLE_P (table))
 219     {
 220       Lisp_Object ch;
 221
 222       ch = CHAR_TABLE_REF (table, c);
 223       if (CHARACTERP (ch))
 224         c = XINT (ch);
 225     }
 226   else
 227     {
 228       for (; CONSP (table); table = XCDR (table))
 229         c = translate_char (XCAR (table), c);
 230     }
 231   return c;
 232 }
 233
 234 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 235    them, return (C & 0xFF).  */
 236
 237 int
 238 multibyte_char_to_unibyte (int c)
 239 {
 240   if (c < 0x80)
 241     return c;
 242   if (CHAR_BYTE8_P (c))
 243     return CHAR_TO_BYTE8 (c);
 244   return (c & 0xFF);
 245 }
 246
 247 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 248    by charset_unibyte.  */
 249
 250 int
 251 multibyte_char_to_unibyte_safe (int c)
 252 {
 253   if (c < 0x80)
 254     return c;
 255   if (CHAR_BYTE8_P (c))
 256     return CHAR_TO_BYTE8 (c);
 257   return -1;
 258 }
 259
 260 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 261        doc: /* Return non-nil if OBJECT is a character.
 262 usage: (characterp OBJECT)  */)
 263   (Lisp_Object object, Lisp_Object ignore)
 264 {
 265   return (CHARACTERP (object) ? Qt : Qnil);
 266 }
 267
 268 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 269        doc: /* Return the character of the maximum code.  */)
 270   (void)
 271 {
 272   return make_number (MAX_CHAR);
 273 }
 274
 275 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 276        Sunibyte_char_to_multibyte, 1, 1, 0,
 277        doc: /* Convert the byte CH to multibyte character.  */)
 278   (Lisp_Object ch)
 279 {
 280   int c;
 281
 282   CHECK_CHARACTER (ch);
 283   c = XFASTINT (ch);
 284   if (c >= 0x100)
 285     error ("Not a unibyte character: %d", c);
 286   MAKE_CHAR_MULTIBYTE (c);
 287   return make_number (c);
 288 }
 289
 290 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 291        Smultibyte_char_to_unibyte, 1, 1, 0,
 292        doc: /* Convert the multibyte character CH to a byte.
 293 If the multibyte character does not represent a byte, return -1.  */)
 294   (Lisp_Object ch)
 295 {
 296   int cm;
 297
 298   CHECK_CHARACTER (ch);
 299   cm = XFASTINT (ch);
 300   if (cm < 256)
 301     /* Can't distinguish a byte read from a unibyte buffer from
 302        a latin1 char, so let's let it slide.  */
 303     return ch;
 304   else
 305     {
 306       int cu = CHAR_TO_BYTE_SAFE (cm);
 307       return make_number (cu);
 308     }
 309 }
 310
 311
 312 /* Return width (columns) of C considering the buffer display table DP. */
 313
 314 static int
 315 char_width (int c, struct Lisp_Char_Table *dp)
 316 {
 317   int width = CHAR_WIDTH (c);
 318
 319   if (dp)
 320     {
 321       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 322       int i;
 323
 324       if (VECTORP (disp))
 325         for (i = 0, width = 0; i < ASIZE (disp); i++)
 326           {
 327             ch = AREF (disp, i);
 328             if (CHARACTERP (ch))
 329               width += CHAR_WIDTH (XFASTINT (ch));
 330           }
 331     }
 332   return width;
 333 }
 334
 335
 336 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 337        doc: /* Return width of CHAR when displayed in the current buffer.
 338 The width is measured by how many columns it occupies on the screen.
 339 Tab is taken to occupy `tab-width' columns.
 340 usage: (char-width CHAR)  */)
 341   (Lisp_Object ch)
 342 {
 343   int c, width;
 344
 345   CHECK_CHARACTER (ch);
 346   c = XINT (ch);
 347   width = char_width (c, buffer_display_table ());
 348   return make_number (width);
 349 }
 350
 351 /* Return width of string STR of length LEN when displayed in the
 352    current buffer.  The width is measured by how many columns it
 353    occupies on the screen.  If PRECISION > 0, return the width of
 354    longest substring that doesn't exceed PRECISION, and set number of
 355    characters and bytes of the substring in *NCHARS and *NBYTES
 356    respectively.  */
 357
 358 EMACS_INT
 359 c_string_width (const unsigned char *str, EMACS_INT len, int precision,
 360                 EMACS_INT *nchars, EMACS_INT *nbytes)
 361 {
 362   EMACS_INT i = 0, i_byte = 0;
 363   EMACS_INT width = 0;
 364   struct Lisp_Char_Table *dp = buffer_display_table ();
 365
 366   while (i_byte < len)
 367     {
 368       int bytes;
 369       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 370       int thiswidth = char_width (c, dp);
 371
 372       if (precision > 0
 373           && (width + thiswidth > precision))
 374         {
 375           *nchars = i;
 376           *nbytes = i_byte;
 377           return width;
 378         }
 379       i++;
 380       i_byte += bytes;
 381       width += thiswidth;
 382   }
 383
 384   if (precision > 0)
 385     {
 386       *nchars = i;
 387       *nbytes = i_byte;
 388     }
 389
 390   return width;
 391 }
 392
 393 /* Return width of string STR of length LEN when displayed in the
 394    current buffer.  The width is measured by how many columns it
 395    occupies on the screen.  */
 396
 397 EMACS_INT
 398 strwidth (const char *str, EMACS_INT len)
 399 {
 400   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 401 }
 402
 403 /* Return width of Lisp string STRING when displayed in the current
 404    buffer.  The width is measured by how many columns it occupies on
 405    the screen while paying attention to compositions.  If PRECISION >
 406    0, return the width of longest substring that doesn't exceed
 407    PRECISION, and set number of characters and bytes of the substring
 408    in *NCHARS and *NBYTES respectively.  */
 409
 410 EMACS_INT
 411 lisp_string_width (Lisp_Object string, EMACS_INT precision,
 412                    EMACS_INT *nchars, EMACS_INT *nbytes)
 413 {
 414   EMACS_INT len = SCHARS (string);
 415   /* This set multibyte to 0 even if STRING is multibyte when it
 416      contains only ascii and eight-bit-graphic, but that's
 417      intentional.  */
 418   int multibyte = len < SBYTES (string);
 419   unsigned char *str = SDATA (string);
 420   EMACS_INT i = 0, i_byte = 0;
 421   EMACS_INT width = 0;
 422   struct Lisp_Char_Table *dp = buffer_display_table ();
 423
 424   while (i < len)
 425     {
 426       EMACS_INT chars, bytes, thiswidth;
 427       Lisp_Object val;
 428       ptrdiff_t cmp_id;
 429       EMACS_INT ignore, end;
 430
 431       if (find_composition (i, -1, &ignore, &end, &val, string)
 432           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 433               >= 0))
 434         {
 435           thiswidth = composition_table[cmp_id]->width;
 436           chars = end - i;
 437           bytes = string_char_to_byte (string, end) - i_byte;
 438         }
 439       else
 440         {
 441           int c;
 442
 443           if (multibyte)
 444             {
 445               int cbytes;
 446               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 447               bytes = cbytes;
 448             }
 449           else
 450             c = str[i_byte], bytes = 1;
 451           chars = 1;
 452           thiswidth = char_width (c, dp);
 453         }
 454
 455       if (precision <= 0)
 456         {
 457 #ifdef emacs
 458           if (INT_ADD_OVERFLOW (width, thiswidth))
 459             string_overflow ();
 460 #endif
 461         }
 462       else if (precision - width < thiswidth)
 463         {
 464           *nchars = i;
 465           *nbytes = i_byte;
 466           return width;
 467         }
 468       i += chars;
 469       i_byte += bytes;
 470       width += thiswidth;
 471     }
 472
 473   if (precision > 0)
 474     {
 475       *nchars = i;
 476       *nbytes = i_byte;
 477     }
 478
 479   return width;
 480 }
 481
 482 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 483        doc: /* Return width of STRING when displayed in the current buffer.
 484 Width is measured by how many columns it occupies on the screen.
 485 When calculating width of a multibyte character in STRING,
 486 only the base leading-code is considered; the validity of
 487 the following bytes is not checked.  Tabs in STRING are always
 488 taken to occupy `tab-width' columns.
 489 usage: (string-width STRING)  */)
 490   (Lisp_Object str)
 491 {
 492   Lisp_Object val;
 493
 494   CHECK_STRING (str);
 495   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 496   return val;
 497 }
 498
 499 /* Return the number of characters in the NBYTES bytes at PTR.
 500    This works by looking at the contents and checking for multibyte
 501    sequences while assuming that there's no invalid sequence.
 502    However, if the current buffer has enable-multibyte-characters =
 503    nil, we treat each byte as a character.  */
 504
 505 EMACS_INT
 506 chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 507 {
 508   /* current_buffer is null at early stages of Emacs initialization.  */
 509   if (current_buffer == 0
 510       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 511     return nbytes;
 512
 513   return multibyte_chars_in_text (ptr, nbytes);
 514 }
 515
 516 /* Return the number of characters in the NBYTES bytes at PTR.
 517    This works by looking at the contents and checking for multibyte
 518    sequences while assuming that there's no invalid sequence.  It
 519    ignores enable-multibyte-characters.  */
 520
 521 EMACS_INT
 522 multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 523 {
 524   const unsigned char *endp = ptr + nbytes;
 525   EMACS_INT chars = 0;
 526
 527   while (ptr < endp)
 528     {
 529       EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
 530
 531       if (len == 0)
 532         abort ();
 533       ptr += len;
 534       chars++;
 535     }
 536
 537   return chars;
 538 }
 539
 540 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 541    characters and bytes in it, and store them in *NCHARS and *NBYTES
 542    respectively.  On counting bytes, pay attention to that 8-bit
 543    characters not constructing a valid multibyte sequence are
 544    represented by 2-byte in a multibyte text.  */
 545
 546 void
 547 parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
 548                         EMACS_INT *nchars, EMACS_INT *nbytes)
 549 {
 550   const unsigned char *endp = str + len;
 551   EMACS_INT n, chars = 0, bytes = 0;
 552
 553   if (len >= MAX_MULTIBYTE_LENGTH)
 554     {
 555       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 556       while (str < adjusted_endp)
 557         {
 558           if (! CHAR_BYTE8_HEAD_P (*str)
 559               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 560             str += n, bytes += n;
 561           else
 562             str++, bytes += 2;
 563           chars++;
 564         }
 565     }
 566   while (str < endp)
 567     {
 568       if (! CHAR_BYTE8_HEAD_P (*str)
 569           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 570         str += n, bytes += n;
 571       else
 572         str++, bytes += 2;
 573       chars++;
 574     }
 575
 576   *nchars = chars;
 577   *nbytes = bytes;
 578   return;
 579 }
 580
 581 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 582    It actually converts only such 8-bit characters that don't construct
 583    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 584    NCHARS is nonzero, set *NCHARS to the number of characters in the
 585    text.  It is assured that we can use LEN bytes at STR as a work
 586    area and that is enough.  Return the number of bytes of the
 587    resulting text.  */
 588
 589 EMACS_INT
 590 str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
 591                   EMACS_INT *nchars)
 592 {
 593   unsigned char *p = str, *endp = str + nbytes;
 594   unsigned char *to;
 595   EMACS_INT chars = 0;
 596   int n;
 597
 598   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 599     {
 600       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 601       while (p < adjusted_endp
 602              && ! CHAR_BYTE8_HEAD_P (*p)
 603              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 604         p += n, chars++;
 605     }
 606   while (p < endp
 607          && ! CHAR_BYTE8_HEAD_P (*p)
 608          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 609     p += n, chars++;
 610   if (nchars)
 611     *nchars = chars;
 612   if (p == endp)
 613     return nbytes;
 614
 615   to = p;
 616   nbytes = endp - p;
 617   endp = str + len;
 618   memmove (endp - nbytes, p, nbytes);
 619   p = endp - nbytes;
 620
 621   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 622     {
 623       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 624       while (p < adjusted_endp)
 625         {
 626           if (! CHAR_BYTE8_HEAD_P (*p)
 627               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 628             {
 629               while (n--)
 630                 *to++ = *p++;
 631             }
 632           else
 633             {
 634               int c = *p++;
 635               c = BYTE8_TO_CHAR (c);
 636               to += CHAR_STRING (c, to);
 637             }
 638         }
 639       chars++;
 640     }
 641   while (p < endp)
 642     {
 643       if (! CHAR_BYTE8_HEAD_P (*p)
 644           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 645         {
 646           while (n--)
 647             *to++ = *p++;
 648         }
 649       else
 650         {
 651           int c = *p++;
 652           c = BYTE8_TO_CHAR (c);
 653           to += CHAR_STRING (c, to);
 654         }
 655       chars++;
 656     }
 657   if (nchars)
 658     *nchars = chars;
 659   return (to - str);
 660 }
 661
 662 /* Parse unibyte string at STR of LEN bytes, and return the number of
 663    bytes it may occupy when converted to multibyte string by
 664    `str_to_multibyte'.  */
 665
 666 EMACS_INT
 667 count_size_as_multibyte (const unsigned char *str, EMACS_INT len)
 668 {
 669   const unsigned char *endp = str + len;
 670   EMACS_INT bytes;
 671
 672   for (bytes = 0; str < endp; str++)
 673     {
 674       int n = *str < 0x80 ? 1 : 2;
 675       if (INT_ADD_OVERFLOW (bytes, n))
 676         string_overflow ();
 677       bytes += n;
 678     }
 679   return bytes;
 680 }
 681
 682
 683 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 684    that contains the same single-byte characters.  It actually
 685    converts all 8-bit characters to multibyte forms.  It is assured
 686    that we can use LEN bytes at STR as a work area and that is
 687    enough.  */
 688
 689 EMACS_INT
 690 str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
 691 {
 692   unsigned char *p = str, *endp = str + bytes;
 693   unsigned char *to;
 694
 695   while (p < endp && *p < 0x80) p++;
 696   if (p == endp)
 697     return bytes;
 698   to = p;
 699   bytes = endp - p;
 700   endp = str + len;
 701   memmove (endp - bytes, p, bytes);
 702   p = endp - bytes;
 703   while (p < endp)
 704     {
 705       int c = *p++;
 706
 707       if (c >= 0x80)
 708         c = BYTE8_TO_CHAR (c);
 709       to += CHAR_STRING (c, to);
 710     }
 711   return (to - str);
 712 }
 713
 714 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 715    actually converts characters in the range 0x80..0xFF to
 716    unibyte.  */
 717
 718 EMACS_INT
 719 str_as_unibyte (unsigned char *str, EMACS_INT bytes)
 720 {
 721   const unsigned char *p = str, *endp = str + bytes;
 722   unsigned char *to;
 723   int c, len;
 724
 725   while (p < endp)
 726     {
 727       c = *p;
 728       len = BYTES_BY_CHAR_HEAD (c);
 729       if (CHAR_BYTE8_HEAD_P (c))
 730         break;
 731       p += len;
 732     }
 733   to = str + (p - str);
 734   while (p < endp)
 735     {
 736       c = *p;
 737       len = BYTES_BY_CHAR_HEAD (c);
 738       if (CHAR_BYTE8_HEAD_P (c))
 739         {
 740           c = STRING_CHAR_ADVANCE (p);
 741           *to++ = CHAR_TO_BYTE8 (c);
 742         }
 743       else
 744         {
 745           while (len--) *to++ = *p++;
 746         }
 747     }
 748   return (to - str);
 749 }
 750
 751 /* Convert eight-bit chars in SRC (in multibyte form) to the
 752    corresponding byte and store in DST.  CHARS is the number of
 753    characters in SRC.  The value is the number of bytes stored in DST.
 754    Usually, the value is the same as CHARS, but is less than it if SRC
 755    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 756    is nonzero, a Latin-1 character is accepted and converted to a byte
 757    of that character code.
 758    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 759
 760 EMACS_INT
 761 str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
 762 {
 763   EMACS_INT i;
 764
 765   for (i = 0; i < chars; i++)
 766     {
 767       int c = STRING_CHAR_ADVANCE (src);
 768
 769       if (CHAR_BYTE8_P (c))
 770         c = CHAR_TO_BYTE8 (c);
 771       else if (! ASCII_CHAR_P (c)
 772                && (! accept_latin_1 || c >= 0x100))
 773         return i;
 774       *dst++ = c;
 775     }
 776   return i;
 777 }
 778
 779
 780 static EMACS_INT
 781 string_count_byte8 (Lisp_Object string)
 782 {
 783   int multibyte = STRING_MULTIBYTE (string);
 784   EMACS_INT nbytes = SBYTES (string);
 785   unsigned char *p = SDATA (string);
 786   unsigned char *pend = p + nbytes;
 787   EMACS_INT count = 0;
 788   int c, len;
 789
 790   if (multibyte)
 791     while (p < pend)
 792       {
 793         c = *p;
 794         len = BYTES_BY_CHAR_HEAD (c);
 795
 796         if (CHAR_BYTE8_HEAD_P (c))
 797           count++;
 798         p += len;
 799       }
 800   else
 801     while (p < pend)
 802       {
 803         if (*p++ >= 0x80)
 804           count++;
 805       }
 806   return count;
 807 }
 808
 809
 810 Lisp_Object
 811 string_escape_byte8 (Lisp_Object string)
 812 {
 813   EMACS_INT nchars = SCHARS (string);
 814   EMACS_INT nbytes = SBYTES (string);
 815   int multibyte = STRING_MULTIBYTE (string);
 816   EMACS_INT byte8_count;
 817   const unsigned char *src, *src_end;
 818   unsigned char *dst;
 819   Lisp_Object val;
 820   int c, len;
 821
 822   if (multibyte && nchars == nbytes)
 823     return string;
 824
 825   byte8_count = string_count_byte8 (string);
 826
 827   if (byte8_count == 0)
 828     return string;
 829
 830   if (multibyte)
 831     {
 832       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 833           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 834         string_overflow ();
 835
 836       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 837       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 838                                           nbytes + byte8_count * 2);
 839     }
 840   else
 841     {
 842       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 843         string_overflow ();
 844
 845       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 846       val = make_uninit_string (nbytes + byte8_count * 3);
 847     }
 848
 849   src = SDATA (string);
 850   src_end = src + nbytes;
 851   dst = SDATA (val);
 852   if (multibyte)
 853     while (src < src_end)
 854       {
 855         c = *src;
 856         len = BYTES_BY_CHAR_HEAD (c);
 857
 858         if (CHAR_BYTE8_HEAD_P (c))
 859           {
 860             c = STRING_CHAR_ADVANCE (src);
 861             c = CHAR_TO_BYTE8 (c);
 862             sprintf ((char *) dst, "\\%03o", c);
 863             dst += 4;
 864           }
 865         else
 866           while (len--) *dst++ = *src++;
 867       }
 868   else
 869     while (src < src_end)
 870       {
 871         c = *src++;
 872         if (c >= 0x80)
 873           {
 874             sprintf ((char *) dst, "\\%03o", c);
 875             dst += 4;
 876           }
 877         else
 878           *dst++ = c;
 879       }
 880   return val;
 881 }
 882
 883 \f
 884 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 885        doc: /*
 886 Concatenate all the argument characters and make the result a string.
 887 usage: (string &rest CHARACTERS)  */)
 888   (ptrdiff_t n, Lisp_Object *args)
 889 {
 890   ptrdiff_t i;
 891   int c;
 892   unsigned char *buf, *p;
 893   Lisp_Object str;
 894   USE_SAFE_ALLOCA;
 895
 896   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 897   p = buf;
 898
 899   for (i = 0; i < n; i++)
 900     {
 901       CHECK_CHARACTER (args[i]);
 902       c = XINT (args[i]);
 903       p += CHAR_STRING (c, p);
 904     }
 905
 906   str = make_string_from_bytes ((char *) buf, n, p - buf);
 907   SAFE_FREE ();
 908   return str;
 909 }
 910
 911 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 912        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 913 usage: (unibyte-string &rest BYTES)  */)
 914   (ptrdiff_t n, Lisp_Object *args)
 915 {
 916   ptrdiff_t i;
 917   int c;
 918   unsigned char *buf, *p;
 919   Lisp_Object str;
 920   USE_SAFE_ALLOCA;
 921
 922   SAFE_ALLOCA (buf, unsigned char *, n);
 923   p = buf;
 924
 925   for (i = 0; i < n; i++)
 926     {
 927       CHECK_NATNUM (args[i]);
 928       c = XFASTINT (args[i]);
 929       if (c >= 256)
 930         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 931       *p++ = c;
 932     }
 933
 934   str = make_string_from_bytes ((char *) buf, n, p - buf);
 935   SAFE_FREE ();
 936   return str;
 937 }
 938
 939 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 940        Schar_resolve_modifiers, 1, 1, 0,
 941        doc: /* Resolve modifiers in the character CHAR.
 942 The value is a character with modifiers resolved into the character
 943 code.  Unresolved modifiers are kept in the value.
 944 usage: (char-resolve-modifiers CHAR)  */)
 945   (Lisp_Object character)
 946 {
 947   int c;
 948
 949   CHECK_NUMBER (character);
 950   c = XINT (character);
 951   return make_number (char_resolve_modifier_mask (c));
 952 }
 953
 954 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 955        doc: /* Return a byte value of a character at point.
 956 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 957 a byte value.
 958 Optional 2nd arg STRING, if non-nil, is a string of which first
 959 character is a target to get a byte value.  In this case, POSITION, if
 960 non-nil, is an index of a target character in the string.
 961
 962 If the current buffer (or STRING) is multibyte, and the target
 963 character is not ASCII nor 8-bit character, an error is signaled.  */)
 964   (Lisp_Object position, Lisp_Object string)
 965 {
 966   int c;
 967   EMACS_INT pos;
 968   unsigned char *p;
 969
 970   if (NILP (string))
 971     {
 972       if (NILP (position))
 973         {
 974           p = PT_ADDR;
 975         }
 976       else
 977         {
 978           CHECK_NUMBER_COERCE_MARKER (position);
 979           if (XINT (position) < BEGV || XINT (position) >= ZV)
 980             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 981           pos = XFASTINT (position);
 982           p = CHAR_POS_ADDR (pos);
 983         }
 984       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 985         return make_number (*p);
 986     }
 987   else
 988     {
 989       CHECK_STRING (string);
 990       if (NILP (position))
 991         {
 992           p = SDATA (string);
 993         }
 994       else
 995         {
 996           CHECK_NATNUM (position);
 997           if (XINT (position) >= SCHARS (string))
 998             args_out_of_range (string, position);
 999           pos = XFASTINT (position);
1000           p = SDATA (string) + string_char_to_byte (string, pos);
1001         }
1002       if (! STRING_MULTIBYTE (string))
1003         return make_number (*p);
1004     }
1005   c = STRING_CHAR (p);
1006   if (CHAR_BYTE8_P (c))
1007     c = CHAR_TO_BYTE8 (c);
1008   else if (! ASCII_CHAR_P (c))
1009     error ("Not an ASCII nor an 8-bit character: %d", c);
1010   return make_number (c);
1011 }
1012
1013
/* One-time initialization hook for this file.  There is currently
   nothing to do here; the body is empty.  */
void
init_character_once (void)
{
}
1018
1019 #ifdef emacs
1020
/* Set up the Lisp symbols, primitives and variables that belong to
   this file, and give the variables their initial values.  */
void
syms_of_character (void)
{
  /* Symbols referred to from C code in this file.  */
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is registered with staticpro and starts out
     as nil.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* The Lisp primitives defined in this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with a vector of 16 elements, all nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The default width is 1.  Codes 0x80..0x9F and the codes above
     MAX_5_BYTE_CHAR are given width 4 (presumably because they are
     shown as a backslash and three octal digits -- confirm).  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Codes 32..126 and 160..MAX_5_BYTE_CHAR are marked t; everything
     else keeps the default of nil.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* The extra-slots property is put on the symbol before the table
     is created with that symbol as its first argument.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1104
1105 #endif /* emacs */