X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/87f673176c5c159a62833ea4df659da2e1cd10fe..fcdd45857005869fe14081dc783fe4ba6ac66d01:/src/charset.c diff --git a/src/charset.c b/src/charset.c index 52ba5f28c5..211de24ef8 100644 --- a/src/charset.c +++ b/src/charset.c @@ -1,6 +1,9 @@ /* Basic multilingual character support. - Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN. - Licensed to the Free Software Foundation. + Copyright (C) 2001, 2002, 2003, 2004, 2005, + 2006 Free Software Foundation, Inc. + Copyright (C) 1995, 1997, 1998, 1999, 2000, 2001 + National Institute of Advanced Industrial Science and Technology (AIST) + Registration Number H14PRO021 This file is part of GNU Emacs. @@ -16,8 +19,8 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GNU Emacs; see the file COPYING. If not, write to -the Free Software Foundation, Inc., 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. */ +the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +Boston, MA 02110-1301, USA. */ /* At first, see the document in `charset.h' to understand the code in this file. */ @@ -34,6 +37,7 @@ Boston, MA 02111-1307, USA. */ #include "lisp.h" #include "buffer.h" #include "charset.h" +#include "composite.h" #include "coding.h" #include "disptab.h" @@ -43,17 +47,17 @@ Boston, MA 02111-1307, USA. */ #endif /* emacs */ -Lisp_Object Qcharset, Qascii; +Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic; Lisp_Object Qunknown; /* Declaration of special leading-codes. */ -int leading_code_private_11; /* for private DIMENSION1 of 1-column */ -int leading_code_private_12; /* for private DIMENSION1 of 2-column */ -int leading_code_private_21; /* for private DIMENSION2 of 1-column */ -int leading_code_private_22; /* for private DIMENSION2 of 2-column */ +EMACS_INT leading_code_private_11; /* for private DIMENSION1 of 1-column */ +EMACS_INT leading_code_private_12; /* for private DIMENSION1 of 2-column */ +EMACS_INT leading_code_private_21; /* for private DIMENSION2 of 1-column */ +EMACS_INT leading_code_private_22; /* for private DIMENSION2 of 2-column */ -/* Declaration of special charsets. */ -int charset_ascii; /* ASCII */ +/* Declaration of special charsets. The values are set by + Fsetup_special_charsets. */ int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */ int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */ int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */ @@ -61,6 +65,9 @@ int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */ int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */ int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */ int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */ +int charset_mule_unicode_0100_24ff; +int charset_mule_unicode_2500_33ff; +int charset_mule_unicode_e000_ffff; Lisp_Object Qcharset_table; @@ -96,7 +103,7 @@ unsigned char *_fetch_multibyte_char_p; int _fetch_multibyte_char_len; /* Offset to add to a non-ASCII value when inserting it. */ -int nonascii_insert_offset; +EMACS_INT nonascii_insert_offset; /* Translation table for converting non-ASCII unibyte characters to multibyte codes, or nil. */ @@ -105,84 +112,65 @@ Lisp_Object Vnonascii_translation_table; /* List of all possible generic characters. */ Lisp_Object Vgeneric_character_list; -#define min(X, Y) ((X) < (Y) ? (X) : (Y)) -#define max(X, Y) ((X) > (Y) ? (X) : (Y)) void invalid_character (c) int c; { - error ("Invalid character: 0%o, %d, 0x%x", c, c, c); + error ("Invalid character: %d, #o%o, #x%x", c, c, c); } -/* Parse a multibyte character string STR of length LENGTH (>= 2) set - BYTES to the length of actual multibyte sequence, CHARSET, C1, and - C2 to such values that MAKE_CHAR can make the multibyte character - from them. - - It is assumed that *STR is one of base leading codes and the - following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P. - - This macro should be called only from SPLIT_MULTIBYTE_SEQ. */ - -#define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \ - do { \ - (bytes) = 1; \ - (charset) = (str)[0]; \ - if ((charset) >= LEADING_CODE_PRIVATE_11 \ - && (charset) <= LEADING_CODE_PRIVATE_22) \ - (charset) = (str)[(bytes)++]; \ - if ((bytes) < (length)) \ - { \ - (c1) = (str)[(bytes)++] & 0x7F; \ - if ((bytes) < (length)) \ - (c2) = (str)[(bytes)++] & 0x7F; \ - else \ - (c2) = -1; \ - } \ - else \ - (c1) = (c2) = -1; \ +/* Parse string STR of length LENGTH and fetch information of a + character at STR. Set BYTES to the byte length the character + occupies, CHARSET, C1, C2 to proper values of the character. */ + +#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \ + do { \ + (c1) = *(str); \ + (bytes) = BYTES_BY_CHAR_HEAD (c1); \ + if ((bytes) == 1) \ + (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \ + else if ((bytes) == 2) \ + { \ + if ((c1) == LEADING_CODE_8_BIT_CONTROL) \ + (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \ + else \ + (charset) = (c1), (c1) = (str)[1] & 0x7F; \ + } \ + else if ((bytes) == 3) \ + { \ + if ((c1) < LEADING_CODE_PRIVATE_11) \ + (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \ + else \ + (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \ + } \ + else \ + (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \ } while (0) -/* Parse string STR of length LENGTH and check if a multibyte - characters is at STR. Set BYTES to the actual length, CHARSET, C1, - C2 to proper values for that character. */ - -#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \ - do { \ - int i; \ - if (ASCII_BYTE_P ((str)[0])) \ - i = 1; \ - else \ - for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \ - if (i == 1) \ - (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \ - else \ - { \ - if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \ - i = BYTES_BY_CHAR_HEAD ((str)[0]); \ - SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \ - } \ - } while (0) - -/* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */ +/* 1 if CHARSET, C1, and C2 compose a valid character, else 0. + Note that this intentionally allows invalid components, such + as 0xA0 0xA0, because there exist many files that contain + such invalid byte sequences, especially in EUC-GB. */ #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \ - (charset == CHARSET_ASCII \ + ((charset) == CHARSET_ASCII \ ? ((c1) >= 0 && (c1) <= 0x7F) \ - : (CHARSET_DIMENSION (charset) == 1 \ - ? ((c1) >= 0x20 && (c1) <= 0x7F) \ - : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F))) + : ((charset) == CHARSET_8_BIT_CONTROL \ + ? ((c1) >= 0x80 && (c1) <= 0x9F) \ + : ((charset) == CHARSET_8_BIT_GRAPHIC \ + ? ((c1) >= 0x80 && (c1) <= 0xFF) \ + : (CHARSET_DIMENSION (charset) == 1 \ + ? ((c1) >= 0x20 && (c1) <= 0x7F) \ + : ((c1) >= 0x20 && (c1) <= 0x7F \ + && (c2) >= 0x20 && (c2) <= 0x7F))))) /* Store multi-byte form of the character C in STR. The caller should allocate at least 4-byte area at STR in advance. Returns the length of the multi-byte form. If C is an invalid character code, - signal an error. - - Use macro `CHAR_STRING (C, STR)' instead of calling this function - directly if C can be an ASCII character. */ + return -1. */ int -char_to_string (c, str) +char_to_string_1 (c, str) int c; unsigned char *str; { @@ -192,7 +180,7 @@ char_to_string (c, str) { /* Multibyte character can't have a modifier bit. */ if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK))) - invalid_character (c); + return -1; /* For Meta, Shift, and Control modifiers, we need special care. */ if (c & CHAR_META) @@ -224,17 +212,25 @@ char_to_string (c, str) c &= (037 | (~0177 & ~CHAR_CTL)); } - /* If C still has any modifier bits, it is an invalid character. */ - if (c & CHAR_MODIFIER_MASK) - invalid_character (c); + /* If C still has any modifier bits, just ignore it. */ + c &= ~CHAR_MODIFIER_MASK; + } - *p++ = c; + if (SINGLE_BYTE_CHAR_P (c)) + { + if (ASCII_BYTE_P (c) || c >= 0xA0) + *p++ = c; + else + { + *p++ = LEADING_CODE_8_BIT_CONTROL; + *p++ = c + 0x20; + } } - else if (c < MAX_CHAR) + else if (CHAR_VALID_P (c, 0)) { int charset, c1, c2; - SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); + SPLIT_CHAR (c, charset, c1, c2); if (charset >= LEADING_CODE_EXT_11) *p++ = (charset < LEADING_CODE_EXT_12 @@ -245,8 +241,8 @@ char_to_string (c, str) ? LEADING_CODE_PRIVATE_21 : LEADING_CODE_PRIVATE_22))); *p++ = charset; - if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32) - invalid_character (c); + if ((c1 > 0 && c1 < 32) || (c2 > 0 && c2 < 32)) + return -1; if (c1) { *p++ = c1 | 0x80; @@ -254,10 +250,34 @@ char_to_string (c, str) *p++ = c2 | 0x80; } } + else + return -1; - return (p -str); + return (p - str); +} + + +/* Store multi-byte form of the character C in STR. The caller should + allocate at least 4-byte area at STR in advance. Returns the + length of the multi-byte form. If C is an invalid character code, + signal an error. + + Use macro `CHAR_STRING (C, STR)' instead of calling this function + directly if C can be an ASCII character. */ + +int +char_to_string (c, str) + int c; + unsigned char *str; +{ + int len; + len = char_to_string_1 (c, str); + if (len == -1) + invalid_character (c); + return len; } + /* Return the non-ASCII character corresponding to multi-byte form at STR of length LEN. If ACTUAL_LEN is not NULL, store the byte length of the multibyte form in *ACTUAL_LEN. @@ -322,14 +342,16 @@ int char_printable_p (c) int c; { - int charset, c1, c2, chars; + int charset, c1, c2; - if (SINGLE_BYTE_CHAR_P (c)) + if (ASCII_BYTE_P (c)) return 1; - if (c >= MAX_CHAR) + else if (SINGLE_BYTE_CHAR_P (c)) return 0; - - SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); + else if (c >= MAX_CHAR) + return 0; + + SPLIT_CHAR (c, charset, c1, c2); if (! CHARSET_DEFINED_P (charset)) return 0; if (CHARSET_CHARS (charset) == 94 @@ -363,7 +385,7 @@ translate_char (table, c, charset, c1, c2) SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2); dimension = CHARSET_DIMENSION (alt_charset); - if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0) + if ((dimension == 1 && alt_c1 > 0) || (dimension == 2 && alt_c2 > 0)) /* CH is not a generic character, just return it. */ return XFASTINT (ch); @@ -472,6 +494,11 @@ update_charset_table (charset_id, dimension, chars, width, direction, CHARSET_TABLE_ENTRY (charset) = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil); + if (NILP (long_name)) + long_name = short_name; + if (NILP (description)) + description = long_name; + /* Get byte length of multibyte form, base leading-code, and extended leading-code of the charset. See the comment under the title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */ @@ -479,7 +506,7 @@ update_charset_table (charset_id, dimension, chars, width, direction, if (charset < MIN_CHARSET_PRIVATE_DIMENSION1) { /* Official charset, it doesn't have an extended leading-code. */ - if (charset != CHARSET_ASCII) + if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC) bytes += 1; /* For a base leading-code. */ leading_code_base = charset; leading_code_ext = 0; @@ -497,10 +524,9 @@ update_charset_table (charset_id, dimension, chars, width, direction, ? LEADING_CODE_PRIVATE_21 : LEADING_CODE_PRIVATE_22))); leading_code_ext = charset; - } - - if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes) - error ("Invalid dimension for the charset-ID %d", charset); + if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes) + error ("Invalid dimension for the charset-ID %d", charset); + } CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id; CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes); @@ -548,9 +574,10 @@ update_charset_table (charset_id, dimension, chars, width, direction, = make_number (-1); } - if (charset != CHARSET_ASCII + if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC && charset < MIN_CHARSET_PRIVATE_DIMENSION1) { + bytes_by_char_head[leading_code_base] = bytes; width_by_char_head[leading_code_base] = XINT (width); /* Update table emacs_code_class. */ @@ -562,7 +589,8 @@ update_charset_table (charset_id, dimension, chars, width, direction, } /* Update table iso_charset_table. */ - if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0) + if (XINT (iso_final_char) >= 0 + && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0) ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset; } @@ -577,11 +605,15 @@ get_charset_id (charset_symbol) Lisp_Object val; int charset; - return ((SYMBOLP (charset_symbol) - && (val = Fget (charset_symbol, Qcharset), VECTORP (val)) - && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]), - CHARSET_VALID_P (charset))) - ? charset : -1); + /* This originally used a ?: operator, but reportedly the HP-UX + compiler version HP92453-01 A.10.32.22 miscompiles that. */ + if (SYMBOLP (charset_symbol) + && VECTORP (val = Fget (charset_symbol, Qcharset)) + && CHARSET_VALID_P (charset = + XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]))) + return charset; + else + return -1; } /* Return an identification number for a new private charset of @@ -595,17 +627,13 @@ get_new_private_charset_id (dimension, width) if (dimension == 1) { - if (width == 1) - from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12; - else - from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21; + from = LEADING_CODE_EXT_11; + to = LEADING_CODE_EXT_21; } else { - if (width == 1) - from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22; - else - from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1; + from = LEADING_CODE_EXT_21; + to = LEADING_CODE_EXT_MAX + 1; } for (charset = from; charset < to; charset++) @@ -615,43 +643,45 @@ get_new_private_charset_id (dimension, width) } DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0, - "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\ -If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\ - treated as a private charset.\n\ -INFO-VECTOR is a vector of the format:\n\ - [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\ - SHORT-NAME LONG-NAME DESCRIPTION]\n\ -The meanings of each elements is as follows:\n\ -DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\ -CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\ -WIDTH (integer) is the number of columns a character in the charset\n\ -occupies on the screen: one of 0, 1, and 2.\n\ -\n\ -DIRECTION (integer) is the rendering direction of characters in the\n\ -charset when rendering. If 0, render from left to right, else\n\ -render from right to left.\n\ -\n\ -ISO-FINAL-CHAR (character) is the final character of the\n\ -corresponding ISO 2022 charset.\n\ -\n\ -ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\ -while encoding to variants of ISO 2022 coding system, one of the\n\ -following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\ -\n\ -SHORT-NAME (string) is the short name to refer to the charset.\n\ -\n\ -LONG-NAME (string) is the long name to refer to the charset.\n\ -\n\ -DESCRIPTION (string) is the description string of the charset.") - (charset_id, charset_symbol, info_vector) + doc: /* Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR. +If CHARSET-ID is nil, it is decided automatically, which means CHARSET is + treated as a private charset. +INFO-VECTOR is a vector of the format: + [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE + SHORT-NAME LONG-NAME DESCRIPTION] +The meanings of each elements is as follows: +DIMENSION (integer) is the number of bytes to represent a character: 1 or 2. +CHARS (integer) is the number of characters in a dimension: 94 or 96. +WIDTH (integer) is the number of columns a character in the charset +occupies on the screen: one of 0, 1, and 2. + +DIRECTION (integer) is the rendering direction of characters in the +charset when rendering. If 0, render from left to right, else +render from right to left. + +ISO-FINAL-CHAR (character) is the final character of the +corresponding ISO 2022 charset. +It may be -1 if the charset is internal use only. + +ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked +while encoding to variants of ISO 2022 coding system, one of the +following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR). +It may be -1 if the charset is internal use only. + +SHORT-NAME (string) is the short name to refer to the charset. + +LONG-NAME (string) is the long name to refer to the charset. + +DESCRIPTION (string) is the description string of the charset. */) + (charset_id, charset_symbol, info_vector) Lisp_Object charset_id, charset_symbol, info_vector; { Lisp_Object *vec; if (!NILP (charset_id)) - CHECK_NUMBER (charset_id, 0); - CHECK_SYMBOL (charset_symbol, 1); - CHECK_VECTOR (info_vector, 2); + CHECK_NUMBER (charset_id); + CHECK_SYMBOL (charset_symbol); + CHECK_VECTOR (info_vector); if (! NILP (charset_id)) { @@ -667,20 +697,22 @@ DESCRIPTION (string) is the description string of the charset.") || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96) || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2) || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1) - || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~') - || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1) + || !INTEGERP (vec[4]) + || !(XINT (vec[4]) == -1 || (XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')) + || !INTEGERP (vec[5]) + || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1) || !STRINGP (vec[6]) || !STRINGP (vec[7]) || !STRINGP (vec[8])) error ("Invalid info-vector argument for defining charset %s", - XSYMBOL (charset_symbol)->name->data); + SDATA (SYMBOL_NAME (charset_symbol))); if (NILP (charset_id)) { charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2])); if (XINT (charset_id) == 0) error ("There's no room for a new private charset %s", - XSYMBOL (charset_symbol)->name->data); + SDATA (SYMBOL_NAME (charset_symbol))); } update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3], @@ -688,34 +720,35 @@ DESCRIPTION (string) is the description string of the charset.") Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id))); CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol; Vcharset_list = Fcons (charset_symbol, Vcharset_list); + Fupdate_coding_systems_internal (); return Qnil; } DEFUN ("generic-character-list", Fgeneric_character_list, Sgeneric_character_list, 0, 0, 0, - "Return a list of all possible generic characters.\n\ -It includes a generic character for a charset not yet defined.") - () + doc: /* Return a list of all possible generic characters. +It includes a generic character for a charset not yet defined. */) + () { return Vgeneric_character_list; } DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char, Sget_unused_iso_final_char, 2, 2, 0, - "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\ -DIMENSION is the number of bytes to represent a character: 1 or 2.\n\ -CHARS is the number of characters in a dimension: 94 or 96.\n\ -\n\ -This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\ -If there's no unused final char for the specified kind of charset,\n\ -return nil.") - (dimension, chars) + doc: /* Return an unused ISO's final char for a charset of DIMENSION and CHARS. +DIMENSION is the number of bytes to represent a character: 1 or 2. +CHARS is the number of characters in a dimension: 94 or 96. + +This final char is for private use, thus the range is `0' (48) .. `?' (63). +If there's no unused final char for the specified kind of charset, +return nil. */) + (dimension, chars) Lisp_Object dimension, chars; { int final_char; - CHECK_NUMBER (dimension, 0); - CHECK_NUMBER (chars, 1); + CHECK_NUMBER (dimension); + CHECK_NUMBER (chars); if (XINT (dimension) != 1 && XINT (dimension) != 2) error ("Invalid charset dimension %d, it should be 1 or 2", XINT (dimension)); @@ -732,17 +765,20 @@ return nil.") DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset, 4, 4, 0, - "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\ -CHARSET should be defined by `defined-charset' in advance.") - (dimension, chars, final_char, charset_symbol) - Lisp_Object dimension, chars, final_char, charset_symbol; + doc: /* Declare an equivalent charset for ISO-2022 decoding. + +On decoding by an ISO-2022 base coding system, when a charset +specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as +if CHARSET is designated instead. */) + (dimension, chars, final_char, charset) + Lisp_Object dimension, chars, final_char, charset; { - int charset; + int charset_id; - CHECK_NUMBER (dimension, 0); - CHECK_NUMBER (chars, 1); - CHECK_NUMBER (final_char, 2); - CHECK_SYMBOL (charset_symbol, 3); + CHECK_NUMBER (dimension); + CHECK_NUMBER (chars); + CHECK_NUMBER (final_char); + CHECK_SYMBOL (charset); if (XINT (dimension) != 1 && XINT (dimension) != 2) error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension)); @@ -750,98 +786,111 @@ CHARSET should be defined by `defined-charset' in advance.") error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars)); if (XINT (final_char) < '0' || XFASTINT (final_char) > '~') error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars)); - if ((charset = get_charset_id (charset_symbol)) < 0) - error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data); + if ((charset_id = get_charset_id (charset)) < 0) + error ("Invalid charset %s", SDATA (SYMBOL_NAME (charset))); - ISO_CHARSET_TABLE (dimension, chars, final_char) = charset; + ISO_CHARSET_TABLE (dimension, chars, final_char) = charset_id; return Qnil; } -/* Return number of different charsets in STR of length LEN. In - addition, for each found charset N, CHARSETS[N] is set 1. The - caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance. - It may lookup a translation table TABLE if supplied. +/* Return information about charsets in the text at PTR of NBYTES + bytes, which are NCHARS characters. The value is: - If MULTIBYTE is zero, do not check multibyte characters, i.e. if - any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any - 8-bit codes are found CHARSET[1] is set to 1. */ + 0: Each character is represented by one byte. This is always + true for unibyte text. + 1: No charsets other than ascii eight-bit-control, + eight-bit-graphic, and latin-1 are found. + 2: Otherwise. + + In addition, if CHARSETS is nonzero, for each found charset N, set + CHARSETS[N] to 1. For that, callers should allocate CHARSETS + (MAX_CHARSET + 1 elements) in advance. It may lookup a translation + table TABLE if supplied. For invalid charsets, set CHARSETS[1] to + 1 (note that there's no charset whose ID is 1). */ int -find_charset_in_str (str, len, charsets, table, multibyte) - unsigned char *str; - int len, *charsets; +find_charset_in_text (ptr, nchars, nbytes, charsets, table) + const unsigned char *ptr; + int nchars, nbytes, *charsets; Lisp_Object table; - int multibyte; { - register int num = 0; - - if (! multibyte) + if (nchars == nbytes) { - unsigned char *endp = str + len; - int maskbits = 0; - - while (str < endp && maskbits != 3) - maskbits |= (*str++ < 0x80 ? 1 : 2); - if (maskbits & 1) - { - charsets[0] = 1; - num++; - } - if (maskbits & 2) + if (charsets && nbytes > 0) { - charsets[1] = 1; - num++; - } - return num; - } + const unsigned char *endp = ptr + nbytes; + int maskbits = 0; - if (! CHAR_TABLE_P (table)) - table = Qnil; + while (ptr < endp && maskbits != 7) + { + maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4); + ptr++; + } - while (len > 0) + if (maskbits & 1) + charsets[CHARSET_ASCII] = 1; + if (maskbits & 2) + charsets[CHARSET_8_BIT_CONTROL] = 1; + if (maskbits & 4) + charsets[CHARSET_8_BIT_GRAPHIC] = 1; + } + return 0; + } + else { + int return_val = 1; int bytes, charset, c1, c2; - SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2); + if (! CHAR_TABLE_P (table)) + table = Qnil; - if (! NILP (table)) + while (nchars-- > 0) { - int c1 = translate_char (table, -1, charset, c1, c2); - if (c1 >= 0) - charset = CHAR_CHARSET (c1); - } + SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2); + ptr += bytes; - if (!charsets[charset]) - { - charsets[charset] = 1; - num += 1; + if (!CHARSET_DEFINED_P (charset)) + charset = 1; + else if (! NILP (table)) + { + int c = translate_char (table, -1, charset, c1, c2); + if (c >= 0) + charset = CHAR_CHARSET (c); + } + + if (return_val == 1 + && charset != CHARSET_ASCII + && charset != CHARSET_8_BIT_CONTROL + && charset != CHARSET_8_BIT_GRAPHIC + && charset != charset_latin_iso8859_1) + return_val = 2; + + if (charsets) + charsets[charset] = 1; + else if (return_val == 2) + break; } - str += bytes; - len -= bytes; + return return_val; } - return num; } DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region, 2, 3, 0, - "Return a list of charsets in the region between BEG and END.\n\ -BEG and END are buffer positions.\n\ -Optional arg TABLE if non-nil is a translation table to look up.\n\ -\n\ -If the region contains invalid multiybte characters,\n\ -`unknown' is included in the returned list.\n\ -\n\ -If the current buffer is unibyte, the returned list contains\n\ -`ascii' if any 7-bit characters are found,\n\ -and `unknown' if any 8-bit characters are found.") - (beg, end, table) + doc: /* Return a list of charsets in the region between BEG and END. +BEG and END are buffer positions. +Optional arg TABLE if non-nil is a translation table to look up. + +If the region contains invalid multibyte characters, +`unknown' is included in the returned list. + +If the current buffer is unibyte, the returned list may contain +only `ascii', `eight-bit-control', and `eight-bit-graphic'. */) + (beg, end, table) Lisp_Object beg, end, table; { int charsets[MAX_CHARSET + 1]; int from, from_byte, to, stop, stop_byte, i; Lisp_Object val; - int undefined; - int multibyte = !NILP (current_buffer->enable_multibyte_characters); validate_region (&beg, &end); from = XFASTINT (beg); @@ -860,8 +909,8 @@ and `unknown' if any 8-bit characters are found.") bzero (charsets, (MAX_CHARSET + 1) * sizeof (int)); while (1) { - find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte, - charsets, table, multibyte); + find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from, + stop_byte - from_byte, charsets, table); if (stop < to) { from = stop, from_byte = stop_byte; @@ -872,69 +921,60 @@ and `unknown' if any 8-bit characters are found.") } val = Qnil; - undefined = 0; - for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--) - if (charsets[i]) - { - if (CHARSET_DEFINED_P (i)) - val = Fcons (CHARSET_SYMBOL (i), val); - else - undefined = 1; - } - if (undefined) + if (charsets[1]) val = Fcons (Qunknown, val); + for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--) + if (charsets[i]) + val = Fcons (CHARSET_SYMBOL (i), val); + if (charsets[0]) + val = Fcons (Qascii, val); return val; } DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string, 1, 2, 0, - "Return a list of charsets in STR.\n\ -Optional arg TABLE if non-nil is a translation table to look up.\n\ -\n\ -If the region contains invalid multiybte characters,\n\ -`unknown' is included in the returned list.\n\ -\n\ -If STR is unibyte, the returned list contains\n\ -`ascii' if any 7-bit characters are found,\n\ -and `unknown' if any 8-bit characters are found.") - (str, table) + doc: /* Return a list of charsets in STR. +Optional arg TABLE if non-nil is a translation table to look up. + +If the string contains invalid multibyte characters, +`unknown' is included in the returned list. + +If STR is unibyte, the returned list may contain +only `ascii', `eight-bit-control', and `eight-bit-graphic'. */) + (str, table) Lisp_Object str, table; { int charsets[MAX_CHARSET + 1]; int i; Lisp_Object val; - int undefined; - int multibyte; - CHECK_STRING (str, 0); - multibyte = STRING_MULTIBYTE (str); + CHECK_STRING (str); bzero (charsets, (MAX_CHARSET + 1) * sizeof (int)); - find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)), - charsets, table, multibyte); + find_charset_in_text (SDATA (str), SCHARS (str), + SBYTES (str), charsets, table); + val = Qnil; - undefined = 0; - for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--) - if (charsets[i]) - { - if (CHARSET_DEFINED_P (i)) - val = Fcons (CHARSET_SYMBOL (i), val); - else - undefined = 1; - } - if (undefined) + if (charsets[1]) val = Fcons (Qunknown, val); + for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--) + if (charsets[i]) + val = Fcons (CHARSET_SYMBOL (i), val); + if (charsets[0]) + val = Fcons (Qascii, val); return val; } + DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0, - "") - (charset, code1, code2) + doc: /* Return a character made from arguments. +Internal use only. */) + (charset, code1, code2) Lisp_Object charset, code1, code2; { int charset_id, c1, c2; - CHECK_NUMBER (charset, 0); + CHECK_NUMBER (charset); charset_id = XINT (charset); if (!CHARSET_DEFINED_P (charset_id)) error ("Invalid charset ID: %d", XINT (charset)); @@ -943,19 +983,41 @@ DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0, c1 = 0; else { - CHECK_NUMBER (code1, 1); + CHECK_NUMBER (code1); c1 = XINT (code1); } if (NILP (code2)) c2 = 0; else { - CHECK_NUMBER (code2, 2); + CHECK_NUMBER (code2); c2 = XINT (code2); } - if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF) - error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2); + if (charset_id == CHARSET_ASCII) + { + if (c1 < 0 || c1 > 0x7F) + goto invalid_code_posints; + return make_number (c1); + } + else if (charset_id == CHARSET_8_BIT_CONTROL) + { + if (NILP (code1)) + c1 = 0x80; + else if (c1 < 0x80 || c1 > 0x9F) + goto invalid_code_posints; + return make_number (c1); + } + else if (charset_id == CHARSET_8_BIT_GRAPHIC) + { + if (NILP (code1)) + c1 = 0xA0; + else if (c1 < 0xA0 || c1 > 0xFF) + goto invalid_code_posints; + return make_number (c1); + } + else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF) + goto invalid_code_posints; c1 &= 0x7F; c2 &= 0x7F; if (c1 == 0 @@ -963,22 +1025,23 @@ DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0, : (c2 == 0 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20) : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2))) - error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2); - + goto invalid_code_posints; return make_number (MAKE_CHAR (charset_id, c1, c2)); + + invalid_code_posints: + error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2); } DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0, - "Return list of charset and one or two position-codes of CHAR.\n\ -If CHAR is invalid as a character code,\n\ -return a list of symbol `unknown' and CHAR.") - (ch) + doc: /* Return list of charset and one or two position-codes of CH. +If CH is invalid as a character code, +return a list of symbol `unknown' and CH. */) + (ch) Lisp_Object ch; { - Lisp_Object val; int c, charset, c1, c2; - CHECK_NUMBER (ch, 0); + CHECK_NUMBER (ch); c = XFASTINT (ch); if (!CHAR_VALID_P (c, 1)) return Fcons (Qunknown, Fcons (ch, Qnil)); @@ -990,69 +1053,48 @@ return a list of symbol `unknown' and CHAR.") } DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0, - "Return charset of CHAR.") - (ch) + doc: /* Return charset of CH. */) + (ch) Lisp_Object ch; { - CHECK_NUMBER (ch, 0); + CHECK_NUMBER (ch); return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch))); } DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0, - "Return charset of a character in the current buffer at position POS.\n\ -If POS is nil, it defauls to the current point.\n\ -If POS is out of range, the value is nil.") - (pos) + doc: /* Return charset of a character in the current buffer at position POS. +If POS is nil, it defauls to the current point. +If POS is out of range, the value is nil. */) + (pos) Lisp_Object pos; { - register int pos_byte, bytes, charset, c1, c2; - register unsigned char *p; - - if (NILP (pos)) - pos_byte = PT_BYTE; - else if (MARKERP (pos)) - { - pos_byte = marker_byte_position (pos); - if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE) - return Qnil; - } - else - { - CHECK_NUMBER (pos, 0); - if (XINT (pos) < BEGV || XINT (pos) >= ZV) - return Qnil; - pos_byte = CHAR_TO_BYTE (XINT (pos)); - } - p = BYTE_POS_ADDR (pos_byte); - if (BASE_LEADING_CODE_P (*p)) - { - SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2); - if (charset < 0) - charset = 1; - } - else - charset = CHARSET_ASCII; + Lisp_Object ch; + int charset; + ch = Fchar_after (pos); + if (! INTEGERP (ch)) + return ch; + charset = CHAR_CHARSET (XINT (ch)); return CHARSET_SYMBOL (charset); } DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0, - "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\ -\n\ -ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\ -by their DIMENSION, CHARS, and FINAL-CHAR,\n\ -where as Emacs distinguishes them by charset symbol.\n\ -See the documentation of the function `charset-info' for the meanings of\n\ -DIMENSION, CHARS, and FINAL-CHAR.") - (dimension, chars, final_char) + doc: /* Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR. + +ISO 2022's designation sequence (escape sequence) distinguishes charsets +by their DIMENSION, CHARS, and FINAL-CHAR, +where as Emacs distinguishes them by charset symbol. +See the documentation of the function `charset-info' for the meanings of +DIMENSION, CHARS, and FINAL-CHAR. */) + (dimension, chars, final_char) Lisp_Object dimension, chars, final_char; { int charset; - CHECK_NUMBER (dimension, 0); - CHECK_NUMBER (chars, 1); - CHECK_NUMBER (final_char, 2); + CHECK_NUMBER (dimension); + CHECK_NUMBER (chars); + CHECK_NUMBER (final_char); if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0) return Qnil; @@ -1069,11 +1111,11 @@ char_valid_p (c, genericp) { int charset, c1, c2; - if (c < 0) + if (c < 0 || c >= MAX_CHAR) return 0; if (SINGLE_BYTE_CHAR_P (c)) return 1; - SPLIT_NON_ASCII_CHAR (c, charset, c1, c2); + SPLIT_CHAR (c, charset, c1, c2); if (genericp) { if (c1) @@ -1090,10 +1132,10 @@ char_valid_p (c, genericp) } DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0, - "Return t if OBJECT is a valid normal character.\n\ -If optional arg GENERICP is non-nil, also return t if OBJECT is\n\ -a valid generic character.") - (object, genericp) + doc: /* Return t if OBJECT is a valid normal character. +If optional arg GENERICP is non-nil, also return t if OBJECT is +a valid generic character. */) + (object, genericp) Lisp_Object object, genericp; { if (! NATNUMP (object)) @@ -1103,15 +1145,15 @@ a valid generic character.") DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte, Sunibyte_char_to_multibyte, 1, 1, 0, - "Convert the unibyte character CH to multibyte character.\n\ -The conversion is done based on `nonascii-translation-table' (which see)\n\ - or `nonascii-insert-offset' (which see).") - (ch) + doc: /* Convert the unibyte character CH to multibyte character. +The conversion is done based on `nonascii-translation-table' (which see) + or `nonascii-insert-offset' (which see). */) + (ch) Lisp_Object ch; { int c; - CHECK_NUMBER (ch, 0); + CHECK_NUMBER (ch); c = XINT (ch); if (c < 0 || c >= 0400) error ("Invalid unibyte character: %d", c); @@ -1123,15 +1165,15 @@ The conversion is done based on `nonascii-translation-table' (which see)\n\ DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte, Smultibyte_char_to_unibyte, 1, 1, 0, - "Convert the multibyte character CH to unibyte character.\n\ -The conversion is done based on `nonascii-translation-table' (which see)\n\ - or `nonascii-insert-offset' (which see).") - (ch) + doc: /* Convert the multibyte character CH to unibyte character. +The conversion is done based on `nonascii-translation-table' (which see) + or `nonascii-insert-offset' (which see). */) + (ch) Lisp_Object ch; { int c; - CHECK_NUMBER (ch, 0); + CHECK_NUMBER (ch); c = XINT (ch); if (! CHAR_VALID_P (c, 0)) error ("Invalid multibyte character: %d", c); @@ -1142,14 +1184,11 @@ The conversion is done based on `nonascii-translation-table' (which see)\n\ } DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0, - "Return 1 regardless of the argument CHAR.\n\ -This is now an obsolete function. We keep it just for backward compatibility.") - (ch) + doc: /* Return 1 regardless of the argument CH. */) + (ch) Lisp_Object ch; { - Lisp_Object val; - - CHECK_NUMBER (ch, 0); + CHECK_NUMBER (ch); return make_number (1); } @@ -1161,7 +1200,9 @@ char_bytes (c) { int charset; - if (SINGLE_BYTE_CHAR_P (c) || (c & ~((1 << CHARACTERBITS) - 1))) + if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1))) + return 1; + if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0) return 1; charset = CHAR_CHARSET (c); @@ -1187,16 +1228,17 @@ char_bytes (c) : 4)))) DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0, - "Return width of CHAR when displayed in the current buffer.\n\ -The width is measured by how many columns it occupies on the screen.") - (ch) - Lisp_Object ch; + doc: /* Return width of CH when displayed in the current buffer. +The width is measured by how many columns it occupies on the screen. +Tab is taken to occupy `tab-width' columns. */) + (ch) + Lisp_Object ch; { Lisp_Object val, disp; int c; struct Lisp_Char_Table *dp = buffer_display_table (); - CHECK_NUMBER (ch, 0); + CHECK_NUMBER (ch); c = XINT (ch); @@ -1225,80 +1267,186 @@ strwidth (str, len) unsigned char *str; int len; { - unsigned char *endp = str + len; + return c_string_width (str, len, -1, NULL, NULL); +} + +/* Return width of string STR of length LEN when displayed in the + current buffer. The width is measured by how many columns it + occupies on the screen. If PRECISION > 0, return the width of + longest substring that doesn't exceed PRECISION, and set number of + characters and bytes of the substring in *NCHARS and *NBYTES + respectively. */ + +int +c_string_width (str, len, precision, nchars, nbytes) + const unsigned char *str; + int len, precision, *nchars, *nbytes; +{ + int i = 0, i_byte = 0; int width = 0; + int chars; struct Lisp_Char_Table *dp = buffer_display_table (); - while (str < endp) + while (i_byte < len) { - Lisp_Object disp; - int thislen; - int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen); + int bytes, thiswidth; + Lisp_Object val; - /* Get the way the display table would display it. */ if (dp) - disp = DISP_CHAR_VECTOR (dp, c); + { + int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes); + + chars = 1; + val = DISP_CHAR_VECTOR (dp, c); + if (VECTORP (val)) + thiswidth = XVECTOR (val)->size; + else + thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]); + } else - disp = Qnil; + { + chars = 1; + PARSE_MULTIBYTE_SEQ (str + i_byte, len - i_byte, bytes); + thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]); + } + + if (precision > 0 + && (width + thiswidth > precision)) + { + *nchars = i; + *nbytes = i_byte; + return width; + } + i++; + i_byte += bytes; + width += thiswidth; + } + + if (precision > 0) + { + *nchars = i; + *nbytes = i_byte; + } + + return width; +} - if (VECTORP (disp)) - width += XVECTOR (disp)->size; +/* Return width of Lisp string STRING when displayed in the current + buffer. The width is measured by how many columns it occupies on + the screen while paying attention to compositions. If PRECISION > + 0, return the width of longest substring that doesn't exceed + PRECISION, and set number of characters and bytes of the substring + in *NCHARS and *NBYTES respectively. */ + +int +lisp_string_width (string, precision, nchars, nbytes) + Lisp_Object string; + int precision, *nchars, *nbytes; +{ + int len = SCHARS (string); + int len_byte = SBYTES (string); + /* This set multibyte to 0 even if STRING is multibyte when it + contains only ascii and eight-bit-graphic, but that's + intentional. */ + int multibyte = len < len_byte; + const unsigned char *str = SDATA (string); + int i = 0, i_byte = 0; + int width = 0; + struct Lisp_Char_Table *dp = buffer_display_table (); + + while (i < len) + { + int chars, bytes, thiswidth; + Lisp_Object val; + int cmp_id; + int ignore, end; + + if (find_composition (i, -1, &ignore, &end, &val, string) + && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string)) + >= 0)) + { + thiswidth = composition_table[cmp_id]->width; + chars = end - i; + bytes = string_char_to_byte (string, end) - i_byte; + } + else if (dp) + { + int c; + + if (multibyte) + c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes); + else + c = str[i_byte], bytes = 1; + chars = 1; + val = DISP_CHAR_VECTOR (dp, c); + if (VECTORP (val)) + thiswidth = XVECTOR (val)->size; + else + thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]); + } else - width += ONE_BYTE_CHAR_WIDTH (*str); + { + chars = 1; + if (multibyte) + PARSE_MULTIBYTE_SEQ (str + i_byte, len_byte - i_byte, bytes); + else + bytes = 1; + thiswidth = ONE_BYTE_CHAR_WIDTH (str[i_byte]); + } + + if (precision > 0 + && (width + thiswidth > precision)) + { + *nchars = i; + *nbytes = i_byte; + return width; + } + i += chars; + i_byte += bytes; + width += thiswidth; + } - str += thislen; + if (precision > 0) + { + *nchars = i; + *nbytes = i_byte; } + return width; } DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0, - "Return width of STRING when displayed in the current buffer.\n\ -Width is measured by how many columns it occupies on the screen.\n\ -When calculating width of a multibyte character in STRING,\n\ -only the base leading-code is considered; the validity of\n\ -the following bytes is not checked.") - (str) - Lisp_Object str; + doc: /* Return width of STRING when displayed in the current buffer. +Width is measured by how many columns it occupies on the screen. +When calculating width of a multibyte character in STRING, +only the base leading-code is considered; the validity of +the following bytes is not checked. Tabs in STRING are always +taken to occupy `tab-width' columns. */) + (string) + Lisp_Object string; { Lisp_Object val; - CHECK_STRING (str, 0); - XSETFASTINT (val, strwidth (XSTRING (str)->data, - STRING_BYTES (XSTRING (str)))); + CHECK_STRING (string); + XSETFASTINT (val, lisp_string_width (string, -1, NULL, NULL)); return val; } DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0, - "Return the direction of CHAR.\n\ -The returned value is 0 for left-to-right and 1 for right-to-left.") - (ch) + doc: /* Return the direction of CH. +The returned value is 0 for left-to-right and 1 for right-to-left. */) + (ch) Lisp_Object ch; { int charset; - CHECK_NUMBER (ch, 0); + CHECK_NUMBER (ch); charset = CHAR_CHARSET (XFASTINT (ch)); if (!CHARSET_DEFINED_P (charset)) invalid_character (XINT (ch)); return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX); } -DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0, - "Return number of characters between BEG and END.") - (beg, end) - Lisp_Object beg, end; -{ - int from, to; - - CHECK_NUMBER_COERCE_MARKER (beg, 0); - CHECK_NUMBER_COERCE_MARKER (end, 1); - - from = min (XFASTINT (beg), XFASTINT (end)); - to = max (XFASTINT (beg), XFASTINT (end)); - - return make_number (to - from); -} - /* Return the number of characters in the NBYTES bytes at PTR. This works by looking at the contents and checking for multibyte sequences. However, if the current buffer has enable-multibyte-characters = nil, @@ -1306,7 +1454,7 @@ DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0, int chars_in_text (ptr, nbytes) - unsigned char *ptr; + const unsigned char *ptr; int nbytes; { /* current_buffer is null at early stages of Emacs initialization. */ @@ -1323,10 +1471,10 @@ chars_in_text (ptr, nbytes) int multibyte_chars_in_text (ptr, nbytes) - unsigned char *ptr; + const unsigned char *ptr; int nbytes; { - unsigned char *endp; + const unsigned char *endp; int chars, bytes; endp = ptr + nbytes; @@ -1334,54 +1482,200 @@ multibyte_chars_in_text (ptr, nbytes) while (ptr < endp) { - if (BASE_LEADING_CODE_P (*ptr)) + PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes); + ptr += bytes; + chars++; + } + + return chars; +} + +/* Parse unibyte text at STR of LEN bytes as multibyte text, and + count the numbers of characters and bytes in it. On counting + bytes, pay attention to the fact that 8-bit characters in the range + 0x80..0x9F are represented by 2 bytes in multibyte text. */ +void +parse_str_as_multibyte (str, len, nchars, nbytes) + const unsigned char *str; + int len, *nchars, *nbytes; +{ + const unsigned char *endp = str + len; + int n, chars = 0, bytes = 0; + + while (str < endp) + { + if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n)) + str += n, bytes += n; + else + str++, bytes += 2; + chars++; + } + *nchars = chars; + *nbytes = bytes; + return; +} + +/* Arrange unibyte text at STR of NBYTES bytes as multibyte text. + It actually converts only 8-bit characters in the range 0x80..0x9F + that don't contruct multibyte characters to multibyte forms. If + NCHARS is nonzero, set *NCHARS to the number of characters in the + text. It is assured that we can use LEN bytes at STR as a work + area and that is enough. Return the number of bytes of the + resulting text. */ + +int +str_as_multibyte (str, len, nbytes, nchars) + unsigned char *str; + int len, nbytes, *nchars; +{ + unsigned char *p = str, *endp = str + nbytes; + unsigned char *to; + int chars = 0; + int n; + + while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n)) + p += n, chars++; + if (nchars) + *nchars = chars; + if (p == endp) + return nbytes; + + to = p; + nbytes = endp - p; + endp = str + len; + safe_bcopy (p, endp - nbytes, nbytes); + p = endp - nbytes; + while (p < endp) + { + if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n)) { - PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes); - ptr += bytes; - nbytes -= bytes; + while (n--) + *to++ = *p++; } else { - ptr++; - nbytes--; + *to++ = LEADING_CODE_8_BIT_CONTROL; + *to++ = *p++ + 0x20; } chars++; } + if (nchars) + *nchars = chars; + return (to - str); +} - return chars; +/* Parse unibyte string at STR of LEN bytes, and return the number of + bytes it may ocupy when converted to multibyte string by + `str_to_multibyte'. */ + +int +parse_str_to_multibyte (str, len) + unsigned char *str; + int len; +{ + unsigned char *endp = str + len; + int bytes; + + for (bytes = 0; str < endp; str++) + bytes += (*str < 0x80 || *str >= 0xA0) ? 1 : 2; + return bytes; } -DEFUN ("string", Fstring, Sstring, 1, MANY, 0, - "Concatenate all the argument characters and make the result a string.") - (n, args) +/* Convert unibyte text at STR of NBYTES bytes to multibyte text + that contains the same single-byte characters. It actually + converts all 8-bit characters to multibyte forms. It is assured + that we can use LEN bytes at STR as a work area and that is + enough. */ + +int +str_to_multibyte (str, len, bytes) + unsigned char *str; + int len, bytes; +{ + unsigned char *p = str, *endp = str + bytes; + unsigned char *to; + + while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++; + if (p == endp) + return bytes; + to = p; + bytes = endp - p; + endp = str + len; + safe_bcopy (p, endp - bytes, bytes); + p = endp - bytes; + while (p < endp) + { + if (*p < 0x80 || *p >= 0xA0) + *to++ = *p++; + else + *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20; + } + return (to - str); +} + +/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It + actually converts only 8-bit characters in the range 0x80..0x9F to + unibyte forms. */ + +int +str_as_unibyte (str, bytes) + unsigned char *str; + int bytes; +{ + unsigned char *p = str, *endp = str + bytes; + unsigned char *to = str; + + while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++; + to = p; + while (p < endp) + { + if (*p == LEADING_CODE_8_BIT_CONTROL) + *to++ = *(p + 1) - 0x20, p += 2; + else + *to++ = *p++; + } + return (to - str); +} + + +DEFUN ("string", Fstring, Sstring, 0, MANY, 0, + doc: /* Concatenate all the argument characters and make the result a string. +usage: (string &rest CHARACTERS) */) + (n, args) int n; Lisp_Object *args; { - int i; - unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n); - unsigned char *p = buf; - Lisp_Object val; - int c, multibyte_p = 0; + int i, bufsize; + unsigned char *buf, *p; + int c; + int multibyte = 0; + Lisp_Object ret; + USE_SAFE_ALLOCA; + + bufsize = MAX_MULTIBYTE_LENGTH * n; + SAFE_ALLOCA (buf, unsigned char *, bufsize); + p = buf; for (i = 0; i < n; i++) { - CHECK_NUMBER (args[i], 0); - c = XINT (args[i]); - p += CHAR_STRING (c, p); + CHECK_NUMBER (args[i]); + if (!multibyte && !SINGLE_BYTE_CHAR_P (XFASTINT (args[i]))) + multibyte = 1; + } - if (!SINGLE_BYTE_CHAR_P (c)) - multibyte_p = 1; + for (i = 0; i < n; i++) + { + c = XINT (args[i]); + if (multibyte) + p += CHAR_STRING (c, p); + else + *p++ = c; } - /* Here, we can't use make_string_from_bytes because of the byte - combining problem. Make a multibyte string if there is any - multibyte character in ARGS to make sure that `(string 2276)' - returns a multibyte string if running --unibyte. */ - if (multibyte_p) - val = make_multibyte_string (buf, n, p - buf); - else - val = make_unibyte_string (buf, p - buf); - return val; + ret = make_string_from_bytes (buf, n, p - buf); + SAFE_FREE (); + + return ret; } #endif /* emacs */ @@ -1400,8 +1694,8 @@ charset_id_internal (charset_name) } DEFUN ("setup-special-charsets", Fsetup_special_charsets, - Ssetup_special_charsets, 0, 0, 0, "Internal use only.") - () + Ssetup_special_charsets, 0, 0, 0, doc: /* Internal use only. */) + () { charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1"); charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978"); @@ -1410,6 +1704,12 @@ DEFUN ("setup-special-charsets", Fsetup_special_charsets, charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201"); charset_big5_1 = charset_id_internal ("chinese-big5-1"); charset_big5_2 = charset_id_internal ("chinese-big5-2"); + charset_mule_unicode_0100_24ff + = charset_id_internal ("mule-unicode-0100-24ff"); + charset_mule_unicode_2500_33ff + = charset_id_internal ("mule-unicode-2500-33ff"); + charset_mule_unicode_e000_ffff + = charset_id_internal ("mule-unicode-e000-ffff"); return Qnil; } @@ -1448,31 +1748,20 @@ init_charset_once () iso_charset_table [i][j][k] = -1; for (i = 0; i < 256; i++) - BYTES_BY_CHAR_HEAD (i) = 1; - for (i = MIN_CHARSET_OFFICIAL_DIMENSION1; - i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++) - BYTES_BY_CHAR_HEAD (i) = 2; - for (i = MIN_CHARSET_OFFICIAL_DIMENSION2; - i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++) - BYTES_BY_CHAR_HEAD (i) = 3; - BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3; - BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3; - BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4; - BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4; - /* The followings don't reflect the actual bytes, but just to tell - that it is a start of a multibyte character. */ - BYTES_BY_CHAR_HEAD (0x80) = 2; - BYTES_BY_CHAR_HEAD (0x9E) = 2; - BYTES_BY_CHAR_HEAD (0x9F) = 2; + bytes_by_char_head[i] = 1; + bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3; + bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3; + bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4; + bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4; for (i = 0; i < 128; i++) - WIDTH_BY_CHAR_HEAD (i) = 1; + width_by_char_head[i] = 1; for (; i < 256; i++) - WIDTH_BY_CHAR_HEAD (i) = 4; - WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1; - WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2; - WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1; - WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2; + width_by_char_head[i] = 4; + width_by_char_head[LEADING_CODE_PRIVATE_11] = 1; + width_by_char_head[LEADING_CODE_PRIVATE_12] = 2; + width_by_char_head[LEADING_CODE_PRIVATE_21] = 1; + width_by_char_head[LEADING_CODE_PRIVATE_22] = 2; { Lisp_Object val; @@ -1498,13 +1787,20 @@ init_charset_once () void syms_of_charset () { + Qcharset = intern ("charset"); + staticpro (&Qcharset); + Qascii = intern ("ascii"); staticpro (&Qascii); - Qcharset = intern ("charset"); - staticpro (&Qcharset); + Qeight_bit_control = intern ("eight-bit-control"); + staticpro (&Qeight_bit_control); + + Qeight_bit_graphic = intern ("eight-bit-graphic"); + staticpro (&Qeight_bit_graphic); - /* Define ASCII charset now. */ + /* Define special charsets ascii, eight-bit-control, and + eight-bit-graphic. */ update_charset_table (make_number (CHARSET_ASCII), make_number (1), make_number (94), make_number (1), @@ -1512,11 +1808,37 @@ syms_of_charset () make_number ('B'), make_number (0), build_string ("ASCII"), - build_string ("ASCII"), + Qnil, /* same as above */ build_string ("ASCII (ISO646 IRV)")); CHARSET_SYMBOL (CHARSET_ASCII) = Qascii; Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII)); + update_charset_table (make_number (CHARSET_8_BIT_CONTROL), + make_number (1), make_number (96), + make_number (4), + make_number (0), + make_number (-1), + make_number (-1), + build_string ("8-bit control code (0x80..0x9F)"), + Qnil, /* same as above */ + Qnil); /* same as above */ + CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control; + Fput (Qeight_bit_control, Qcharset, + CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL)); + + update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC), + make_number (1), make_number (96), + make_number (4), + make_number (0), + make_number (-1), + make_number (-1), + build_string ("8-bit graphic char (0xA0..0xFF)"), + Qnil, /* same as above */ + Qnil); /* same as above */ + CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic; + Fput (Qeight_bit_graphic, Qcharset, + CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC)); + Qauto_fill_chars = intern ("auto-fill-chars"); staticpro (&Qauto_fill_chars); Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0)); @@ -1539,63 +1861,69 @@ syms_of_charset () defsubr (&Schar_width); defsubr (&Sstring_width); defsubr (&Schar_direction); - defsubr (&Schars_in_region); defsubr (&Sstring); defsubr (&Ssetup_special_charsets); DEFVAR_LISP ("charset-list", &Vcharset_list, - "List of charsets ever defined."); - Vcharset_list = Fcons (Qascii, Qnil); + doc: /* List of charsets ever defined. */); + Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control, + Fcons (Qeight_bit_graphic, Qnil))); DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector, - "Vector of cons cell of a symbol and translation table ever defined.\n\ -An ID of a translation table is an index of this vector."); + doc: /* Vector of cons cell of a symbol and translation table ever defined. +An ID of a translation table is an index of this vector. */); Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil); DEFVAR_INT ("leading-code-private-11", &leading_code_private_11, - "Leading-code of private TYPE9N charset of column-width 1."); + doc: /* Leading-code of private TYPE9N charset of column-width 1. */); leading_code_private_11 = LEADING_CODE_PRIVATE_11; DEFVAR_INT ("leading-code-private-12", &leading_code_private_12, - "Leading-code of private TYPE9N charset of column-width 2."); + doc: /* Leading-code of private TYPE9N charset of column-width 2. */); leading_code_private_12 = LEADING_CODE_PRIVATE_12; DEFVAR_INT ("leading-code-private-21", &leading_code_private_21, - "Leading-code of private TYPE9Nx9N charset of column-width 1."); + doc: /* Leading-code of private TYPE9Nx9N charset of column-width 1. */); leading_code_private_21 = LEADING_CODE_PRIVATE_21; DEFVAR_INT ("leading-code-private-22", &leading_code_private_22, - "Leading-code of private TYPE9Nx9N charset of column-width 2."); + doc: /* Leading-code of private TYPE9Nx9N charset of column-width 2. */); leading_code_private_22 = LEADING_CODE_PRIVATE_22; DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset, - "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\ -This is used for converting unibyte text to multibyte,\n\ -and for inserting character codes specified by number.\n\n\ -This serves to convert a Latin-1 or similar 8-bit character code\n\ -to the corresponding Emacs multibyte character code.\n\ -Typically the value should be (- (make-char CHARSET 0) 128),\n\ -for your choice of character set.\n\ -If `nonascii-translation-table' is non-nil, it overrides this variable."); + doc: /* Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte. +This is used for converting unibyte text to multibyte, +and for inserting character codes specified by number. + +This serves to convert a Latin-1 or similar 8-bit character code +to the corresponding Emacs multibyte character code. +Typically the value should be (- (make-char CHARSET 0) 128), +for your choice of character set. +If `nonascii-translation-table' is non-nil, it overrides this variable. */); nonascii_insert_offset = 0; DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table, - "Translation table to convert non-ASCII unibyte codes to multibyte.\n\ -This is used for converting unibyte text to multibyte,\n\ -and for inserting character codes specified by number.\n\n\ -Conversion is performed only when multibyte characters are enabled,\n\ -and it serves to convert a Latin-1 or similar 8-bit character code\n\ -to the corresponding Emacs character code.\n\n\ -If this is nil, `nonascii-insert-offset' is used instead.\n\ -See also the docstring of `make-translation-table'."); + doc: /* Translation table to convert non-ASCII unibyte codes to multibyte. +This is used for converting unibyte text to multibyte, +and for inserting character codes specified by number. + +Conversion is performed only when multibyte characters are enabled, +and it serves to convert a Latin-1 or similar 8-bit character code +to the corresponding Emacs character code. + +If this is nil, `nonascii-insert-offset' is used instead. +See also the docstring of `make-translation-table'. */); Vnonascii_translation_table = Qnil; DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars, - "A char-table for characters which invoke auto-filling.\n\ -Such characters has value t in this table."); + doc: /* A char-table for characters which invoke auto-filling. +Such characters have value t in this table. */); Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil); CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt); CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt); } #endif /* emacs */ + +/* arch-tag: 66a89b8d-4c28-47d3-9ca1-56f78440d69f + (do not change this comment) */