X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/d85d38392e338f66053a6a6f1017720660239338..76b6f7075970e492eba3cf3f4411fcfc4ff3bdcd:/src/character.c diff --git a/src/character.c b/src/character.c index 29a7f80ae4..119502a978 100644 --- a/src/character.c +++ b/src/character.c @@ -1,17 +1,18 @@ /* Basic character support. Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN. Licensed to the Free Software Foundation. - Copyright (C) 2001, 2005, 2006 Free Software Foundation, Inc. - Copyright (C) 2003, 2006 + Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. + Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 National Institute of Advanced Industrial Science and Technology (AIST) Registration Number H13PRO009 This file is part of GNU Emacs. -GNU Emacs is free software; you can redistribute it and/or modify +GNU Emacs is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. GNU Emacs is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -19,9 +20,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with GNU Emacs; see the file COPYING. If not, write to -the Free Software Foundation, Inc., 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. */ +along with GNU Emacs. If not, see . */ /* At first, see the document in `character.h' to understand the code in this file. */ @@ -60,7 +59,7 @@ Lisp_Object Vauto_fill_chars; Lisp_Object Qauto_fill_chars; /* Char-table of information about which character to unify to which - Unicode character. */ + Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */ Lisp_Object Vchar_unify_table; /* A char-table. An element is non-nil iff the corresponding @@ -86,6 +85,8 @@ Lisp_Object Vscript_representative_chars; static Lisp_Object Qchar_script_table; +Lisp_Object Vunicode_category_table; + /* Mapping table from unibyte chars to multibyte chars. */ int unibyte_to_multibyte_table[256]; @@ -95,53 +96,67 @@ char unibyte_has_multibyte_table[256]; +/* If character code C has modifier masks, reflect them to the + character code if possible. Return the resulting code. */ + +int +char_resolve_modifier_mask (c) + int c; +{ + /* A non-ASCII character can't reflect modifier bits to the code. */ + if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK))) + return c; + + /* For Meta, Shift, and Control modifiers, we need special care. */ + if (c & CHAR_SHIFT) + { + /* Shift modifier is valid only with [A-Za-z]. */ + if ((c & 0377) >= 'A' && (c & 0377) <= 'Z') + c &= ~CHAR_SHIFT; + else if ((c & 0377) >= 'a' && (c & 0377) <= 'z') + c = (c & ~CHAR_SHIFT) - ('a' - 'A'); + /* Shift modifier for control characters and SPC is ignored. */ + else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20) + c &= ~CHAR_SHIFT; + } + if (c & CHAR_CTL) + { + /* Simulate the code in lread.c. */ + /* Allow `\C- ' and `\C-?'. */ + if ((c & 0377) == ' ') + c &= ~0177 & ~ CHAR_CTL; + else if ((c & 0377) == '?') + c = 0177 | (c & ~0177 & ~CHAR_CTL); + /* ASCII control chars are made from letters (both cases), + as well as the non-letters within 0100...0137. */ + else if ((c & 0137) >= 0101 && (c & 0137) <= 0132) + c &= (037 | (~0177 & ~CHAR_CTL)); + else if ((c & 0177) >= 0100 && (c & 0177) <= 0137) + c &= (037 | (~0177 & ~CHAR_CTL)); + } + if (c & CHAR_META) + { + /* Move the meta bit to the right place for a string. */ + c = (c & ~CHAR_META) | 0x80; + } + + return c; +} + + /* Store multibyte form of character C at P. If C has modifier bits, handle them appropriately. */ int char_string (c, p) - int c; + unsigned c; unsigned char *p; { int bytes; if (c & CHAR_MODIFIER_MASK) { - /* As an non-ASCII character can't have modifier bits, we just - ignore the bits. */ - if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK))) - { - /* For Meta, Shift, and Control modifiers, we need special care. */ - if (c & CHAR_META) - { - /* Move the meta bit to the right place for a string. */ - c = (c & ~CHAR_META) | 0x80; - } - if (c & CHAR_SHIFT) - { - /* Shift modifier is valid only with [A-Za-z]. */ - if ((c & 0377) >= 'A' && (c & 0377) <= 'Z') - c &= ~CHAR_SHIFT; - else if ((c & 0377) >= 'a' && (c & 0377) <= 'z') - c = (c & ~CHAR_SHIFT) - ('a' - 'A'); - } - if (c & CHAR_CTL) - { - /* Simulate the code in lread.c. */ - /* Allow `\C- ' and `\C-?'. */ - if (c == (CHAR_CTL | ' ')) - c = 0; - else if (c == (CHAR_CTL | '?')) - c = 127; - /* ASCII control chars are made from letters (both cases), - as well as the non-letters within 0100...0137. */ - else if ((c & 0137) >= 0101 && (c & 0137) <= 0132) - c &= (037 | (~0177 & ~CHAR_CTL)); - else if ((c & 0177) >= 0100 && (c & 0177) <= 0137) - c &= (037 | (~0177 & ~CHAR_CTL)); - } - } - + c = (unsigned) char_resolve_modifier_mask ((int) c); /* If C still has any modifier bits, just ignore it. */ c &= ~CHAR_MODIFIER_MASK; } @@ -169,11 +184,13 @@ char_string (c, p) p[4] = (0x80 | (c & 0x3F)); bytes = 5; } - else + else if (c <= MAX_CHAR) { c = CHAR_TO_BYTE8 (c); bytes = BYTE8_STRING (c, p); } + else + error ("Invalid character: %d", c); return bytes; } @@ -309,9 +326,7 @@ DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0, DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte, Sunibyte_char_to_multibyte, 1, 1, 0, - doc: /* Convert the unibyte character CH to multibyte character. -The multibyte character is a result of decoding CH by -the current unibyte charset (see `unibyte-charset'). */) + doc: /* Convert the byte CH to multibyte character. */) (ch) Lisp_Object ch; { @@ -331,23 +346,30 @@ the current unibyte charset (see `unibyte-charset'). */) DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte, Smultibyte_char_to_unibyte, 1, 1, 0, - doc: /* Convert the multibyte character CH to unibyte character.\n\ -The unibyte character is a result of encoding CH by -the current primary charset (value of `charset-primary'). */) + doc: /* Convert the multibyte character CH to a byte. +If the multibyte character does not represent a byte, return -1. */) (ch) Lisp_Object ch; { - int c; + int cm; CHECK_CHARACTER (ch); - c = XFASTINT (ch); - c = CHAR_TO_BYTE8 (c); - return make_number (c); + cm = XFASTINT (ch); + if (cm < 256) + /* Can't distinguish a byte read from a unibyte buffer from + a latin1 char, so let's let it slide. */ + return ch; + else + { + int cu = CHAR_TO_BYTE_SAFE (cm); + return make_number (cu); + } } DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0, doc: /* Return 1 regardless of the argument CHAR. -This is now an obsolete function. We keep it just for backward compatibility. */) +This is now an obsolete function. We keep it just for backward compatibility. +usage: (char-bytes CHAR) */) (ch) Lisp_Object ch; { @@ -358,7 +380,8 @@ This is now an obsolete function. We keep it just for backward compatibility. DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0, doc: /* Return width of CHAR when displayed in the current buffer. The width is measured by how many columns it occupies on the screen. -Tab is taken to occupy `tab-width' columns. */) +Tab is taken to occupy `tab-width' columns. +usage: (char-width CHAR) */) (ch) Lisp_Object ch; { @@ -388,9 +411,7 @@ Tab is taken to occupy `tab-width' columns. */) respectively. */ int -c_string_width (str, len, precision, nchars, nbytes) - const unsigned char *str; - int precision, *nchars, *nbytes; +c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes) { int i = 0, i_byte = 0; int width = 0; @@ -535,7 +556,8 @@ Width is measured by how many columns it occupies on the screen. When calculating width of a multibyte character in STRING, only the base leading-code is considered; the validity of the following bytes is not checked. Tabs in STRING are always -taken to occupy `tab-width' columns. */) +taken to occupy `tab-width' columns. +usage: (string-width STRING) */) (str) Lisp_Object str; { @@ -548,7 +570,8 @@ taken to occupy `tab-width' columns. */) DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0, doc: /* Return the direction of CHAR. -The returned value is 0 for left-to-right and 1 for right-to-left. */) +The returned value is 0 for left-to-right and 1 for right-to-left. +usage: (char-direction CHAR) */) (ch) Lisp_Object ch; { @@ -565,10 +588,10 @@ The returned value is 0 for left-to-right and 1 for right-to-left. */) However, if the current buffer has enable-multibyte-characters = nil, we treat each byte as a character. */ -int +EMACS_INT chars_in_text (ptr, nbytes) const unsigned char *ptr; - int nbytes; + EMACS_INT nbytes; { /* current_buffer is null at early stages of Emacs initialization. */ if (current_buffer == 0 @@ -583,10 +606,10 @@ chars_in_text (ptr, nbytes) sequences while assuming that there's no invalid sequence. It ignores enable-multibyte-characters. */ -int +EMACS_INT multibyte_chars_in_text (ptr, nbytes) const unsigned char *ptr; - int nbytes; + EMACS_INT nbytes; { const unsigned char *endp = ptr + nbytes; int chars = 0; @@ -811,6 +834,39 @@ str_as_unibyte (str, bytes) return (to - str); } +/* Convert eight-bit chars in SRC (in multibyte form) to the + corresponding byte and store in DST. CHARS is the number of + characters in SRC. The value is the number of bytes stored in DST. + Usually, the value is the same as CHARS, but is less than it if SRC + contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1 + is nonzero, a Latin-1 character is accepted and converted to a byte + of that character code. + Note: Currently the arg ACCEPT_LATIN_1 is not used. */ + +EMACS_INT +str_to_unibyte (src, dst, chars, accept_latin_1) + const unsigned char *src; + unsigned char *dst; + EMACS_INT chars; + int accept_latin_1; +{ + EMACS_INT i; + + for (i = 0; i < chars; i++) + { + int c = STRING_CHAR_ADVANCE (src); + + if (CHAR_BYTE8_P (c)) + c = CHAR_TO_BYTE8 (c); + else if (! ASCII_CHAR_P (c) + && (! accept_latin_1 || c >= 0x100)) + return i; + *dst++ = c; + } + return i; +} + + int string_count_byte8 (string) Lisp_Object string; @@ -929,6 +985,107 @@ usage: (string &rest CHARACTERS) */) return make_string_from_bytes ((char *) buf, n, p - buf); } +DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0, + doc: /* Concatenate all the argument bytes and make the result a unibyte string. +usage: (unibyte-string &rest BYTES) */) + (n, args) + int n; + Lisp_Object *args; +{ + int i; + unsigned char *buf = (unsigned char *) alloca (n); + unsigned char *p = buf; + unsigned c; + + for (i = 0; i < n; i++) + { + CHECK_NATNUM (args[i]); + c = XFASTINT (args[i]); + if (c >= 256) + args_out_of_range_3 (args[i], make_number (0), make_number (255)); + *p++ = c; + } + + return make_string_from_bytes ((char *) buf, n, p - buf); +} + +DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers, + Schar_resolve_modifiers, 1, 1, 0, + doc: /* Resolve modifiers in the character CHAR. +The value is a character with modifiers resolved into the character +code. Unresolved modifiers are kept in the value. +usage: (char-resolve-modifiers CHAR) */) + (character) + Lisp_Object character; +{ + int c; + + CHECK_NUMBER (character); + c = XINT (character); + return make_number (char_resolve_modifier_mask (c)); +} + +DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0, + doc: /* Return a byte value of a character at point. +Optional 1st arg POSITION, if non-nil, is a position of a character to get +a byte value. +Optional 2nd arg STRING, if non-nil, is a string of which first +character is a target to get a byte value. In this case, POSITION, if +non-nil, is an index of a target character in the string. + +If the current buffer (or STRING) is multibyte, and the target +character is not ASCII nor 8-bit character, an error is signalled. */) + (position, string) + Lisp_Object position, string; +{ + int c; + EMACS_INT pos; + unsigned char *p; + + if (NILP (string)) + { + if (NILP (position)) + { + p = PT_ADDR; + } + else + { + CHECK_NUMBER_COERCE_MARKER (position); + if (XINT (position) < BEGV || XINT (position) >= ZV) + args_out_of_range_3 (position, make_number (BEGV), make_number (ZV)); + pos = XFASTINT (position); + p = CHAR_POS_ADDR (pos); + } + if (NILP (current_buffer->enable_multibyte_characters)) + return make_number (*p); + } + else + { + CHECK_STRING (string); + if (NILP (position)) + { + p = SDATA (string); + } + else + { + CHECK_NATNUM (position); + if (XINT (position) >= SCHARS (string)) + args_out_of_range (string, position); + pos = XFASTINT (position); + p = SDATA (string) + string_char_to_byte (string, pos); + } + if (! STRING_MULTIBYTE (string)) + return make_number (*p); + } + c = STRING_CHAR (p, 0); + if (CHAR_BYTE8_P (c)) + c = CHAR_TO_BYTE8 (c); + else if (! ASCII_CHAR_P (c)) + error ("Not an ASCII nor an 8-bit character: %d", c); + return make_number (c); +} + + void init_character_once () { @@ -954,6 +1111,9 @@ syms_of_character () defsubr (&Sstring_width); defsubr (&Schar_direction); defsubr (&Sstring); + defsubr (&Sunibyte_string); + defsubr (&Schar_resolve_modifiers); + defsubr (&Sget_byte); DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector, doc: /* @@ -1004,8 +1164,22 @@ It has one extra slot whose value is a list of script symbols. */); Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil); DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars, - doc: /* Alist of scripts vs the representative characters. */); + doc: /* Alist of scripts vs the representative characters. +Each element is a cons (SCRIPT . CHARS), where SCRIPT is a script name symbol, +CHARS is a list or a vector of characters. +If it is a list, all characters in the list are necessary for supporting SCRIPT. +If it is a vector, one of the characters in the vector is necessary. +This variable is used to find a font for a specific script. */); Vscript_representative_chars = Qnil; + + DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table, + doc: /* Char table of Unicode's "General Category". +All Unicode characters have one of the following values (symbol): + Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po, + Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn +See The Unicode Standard for the meaning of those values. */); + /* The correct char-table is setup in characters.el. */ + Vunicode_category_table = Qnil; } #endif /* emacs */