1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2005 Free Software Foundation, Inc.
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* At first, see the document in `character.h' to understand the code
37 #include <sys/types.h>
39 #include "character.h"
42 #include "composite.h"
51 Lisp_Object Qcharacterp
;
53 /* Vector of all translation tables ever defined.
54 The ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector
;
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars
;
60 Lisp_Object Qauto_fill_chars
;
62 /* Char-table of information about which character to unify to which
64 Lisp_Object Vchar_unify_table
;
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars
;
70 /* A char-table. An element is a column-width of the corresponding
72 Lisp_Object Vchar_width_table
;
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table
;
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p
;
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table
;
84 static Lisp_Object Qchar_script_table
;
86 /* Mapping table from unibyte chars to multibyte chars. */
87 int unibyte_to_multibyte_table
[256];
89 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
91 char unibyte_has_multibyte_table
[256];
95 /* Store multibyte form of character C at P. If C has modifier bits,
96 handle them appropriately. */
105 if (c
& CHAR_MODIFIER_MASK
)
107 /* As a non-ASCII character can't have modifier bits, we just
109 if (ASCII_CHAR_P ((c
& ~CHAR_MODIFIER_MASK
)))
111 /* For Meta, Shift, and Control modifiers, we need special care. */
114 /* Move the meta bit to the right place for a string. */
115 c
= (c
& ~CHAR_META
) | 0x80;
119 /* Shift modifier is valid only with [A-Za-z]. */
120 if ((c
& 0377) >= 'A' && (c
& 0377) <= 'Z')
122 else if ((c
& 0377) >= 'a' && (c
& 0377) <= 'z')
123 c
= (c
& ~CHAR_SHIFT
) - ('a' - 'A');
127 /* Simulate the code in lread.c. */
128 /* Allow `\C- ' and `\C-?'. */
129 if (c
== (CHAR_CTL
| ' '))
131 else if (c
== (CHAR_CTL
| '?'))
133 /* ASCII control chars are made from letters (both cases),
134 as well as the non-letters within 0100...0137. */
135 else if ((c
& 0137) >= 0101 && (c
& 0137) <= 0132)
136 c
&= (037 | (~0177 & ~CHAR_CTL
));
137 else if ((c
& 0177) >= 0100 && (c
& 0177) <= 0137)
138 c
&= (037 | (~0177 & ~CHAR_CTL
));
142 /* If C still has any modifier bits, just ignore them. */
143 c
&= ~CHAR_MODIFIER_MASK
;
146 MAYBE_UNIFY_CHAR (c
);
148 if (c
<= MAX_3_BYTE_CHAR
)
150 bytes
= CHAR_STRING (c
, p
);
152 else if (c
<= MAX_4_BYTE_CHAR
)
154 p
[0] = (0xF0 | (c
>> 18));
155 p
[1] = (0x80 | ((c
>> 12) & 0x3F));
156 p
[2] = (0x80 | ((c
>> 6) & 0x3F));
157 p
[3] = (0x80 | (c
& 0x3F));
160 else if (c
<= MAX_5_BYTE_CHAR
)
163 p
[1] = (0x80 | ((c
>> 18) & 0x0F));
164 p
[2] = (0x80 | ((c
>> 12) & 0x3F));
165 p
[3] = (0x80 | ((c
>> 6) & 0x3F));
166 p
[4] = (0x80 | (c
& 0x3F));
171 c
= CHAR_TO_BYTE8 (c
);
172 bytes
= BYTE8_STRING (c
, p
);
179 /* Return a character whose multibyte form is at P. If LEN is not
180 NULL, it must be a pointer to integer. In that case, set *LEN to
181 the byte length of the multibyte form. If ADVANCED is not NULL, it
182 must be a pointer to a pointer to unsigned char. In that case, set *ADVANCED to
183 the ending address (i.e. the starting address of the next
184 character) of the multibyte form. */
187 string_char (p
, advanced
, len
)
188 const unsigned char *p
;
189 const unsigned char **advanced
;
193 const unsigned char *saved_p
= p
;
195 if (*p
< 0x80 || ! (*p
& 0x20) || ! (*p
& 0x10))
197 c
= STRING_CHAR_ADVANCE (p
);
199 else if (! (*p
& 0x08))
201 c
= ((((p
)[0] & 0xF) << 18)
202 | (((p
)[1] & 0x3F) << 12)
203 | (((p
)[2] & 0x3F) << 6)
209 c
= ((((p
)[1] & 0x3F) << 18)
210 | (((p
)[2] & 0x3F) << 12)
211 | (((p
)[3] & 0x3F) << 6)
216 MAYBE_UNIFY_CHAR (c
);
226 /* Translate character C by translation table TABLE. If C is
227 negative, translate a character specified by CHARSET and CODE. If
228 no translation is found in TABLE, return the untranslated
229 character. If TABLE is a list, elements are char tables. In this
230 case, translate C by all tables. */
233 translate_char (table
, c
)
237 if (CHAR_TABLE_P (table
))
241 ch
= CHAR_TABLE_REF (table
, c
);
247 for (; CONSP (table
); table
= XCDR (table
))
248 c
= translate_char (XCAR (table
), c
);
253 /* Convert the multibyte character C to unibyte 8-bit character based
254 on the current value of charset_unibyte. If dimension of
255 charset_unibyte is more than one, return (C & 0xFF).
257 The argument REV_TBL is now ignored. It will be removed in the
261 multibyte_char_to_unibyte (c
, rev_tbl
)
265 struct charset
*charset
;
268 if (CHAR_BYTE8_P (c
))
269 return CHAR_TO_BYTE8 (c
);
270 charset
= CHARSET_FROM_ID (charset_unibyte
);
271 c1
= ENCODE_CHAR (charset
, c
);
272 return ((c1
!= CHARSET_INVALID_CODE (charset
)) ? c1
: c
& 0xFF);
276 DEFUN ("characterp", Fcharacterp
, Scharacterp
, 1, 2, 0,
277 doc
: /* Return non-nil if OBJECT is a character. */)
279 Lisp_Object object
, ignore
;
281 return (CHARACTERP (object
) ? Qt
: Qnil
);
284 DEFUN ("max-char", Fmax_char
, Smax_char
, 0, 0, 0,
285 doc
: /* Return the character of the maximum code. */)
288 return make_number (MAX_CHAR
);
291 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
292 Sunibyte_char_to_multibyte
, 1, 1, 0,
293 doc
: /* Convert the unibyte character CH to multibyte character.
294 The multibyte character is a result of decoding CH by
295 the current unibyte charset (see `unibyte-charset'). */)
300 struct charset
*charset
;
302 CHECK_CHARACTER (ch
);
305 error ("Invalid unibyte character: %d", c
);
306 charset
= CHARSET_FROM_ID (charset_unibyte
);
307 c
= DECODE_CHAR (charset
, c
);
309 c
= BYTE8_TO_CHAR (XFASTINT (ch
));
310 return make_number (c
);
313 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
314 Smultibyte_char_to_unibyte
, 1, 1, 0,
315 doc
: /* Convert the multibyte character CH to unibyte character.\n\
316 The unibyte character is a result of encoding CH by
317 the current primary charset (value of `charset-primary'). */)
323 CHECK_CHARACTER (ch
);
325 c
= CHAR_TO_BYTE8 (c
);
326 return make_number (c
);
329 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
330 doc
: /* Return 1 regardless of the argument CHAR.
331 This is now an obsolete function. We keep it just for backward compatibility. */)
335 CHECK_CHARACTER (ch
);
336 return make_number (1);
339 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
340 doc
: /* Return width of CHAR when displayed in the current buffer.
341 The width is measured by how many columns it occupies on the screen.
342 Tab is taken to occupy `tab-width' columns. */)
348 struct Lisp_Char_Table
*dp
= buffer_display_table ();
350 CHECK_CHARACTER (ch
);
353 /* Get the way the display table would display it. */
354 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
357 width
= ASIZE (disp
);
359 width
= CHAR_WIDTH (c
);
361 return make_number (width
);
364 /* Return width of string STR of length LEN when displayed in the
365 current buffer. The width is measured by how many columns it
366 occupies on the screen. If PRECISION > 0, return the width of
367 longest substring that doesn't exceed PRECISION, and set number of
368 characters and bytes of the substring in *NCHARS and *NBYTES
372 c_string_width (str
, len
, precision
, nchars
, nbytes
)
373 const unsigned char *str
;
374 int precision
, *nchars
, *nbytes
;
376 int i
= 0, i_byte
= 0;
378 struct Lisp_Char_Table
*dp
= buffer_display_table ();
382 int bytes
, thiswidth
;
384 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
388 val
= DISP_CHAR_VECTOR (dp
, c
);
390 thiswidth
= XVECTOR (val
)->size
;
392 thiswidth
= CHAR_WIDTH (c
);
396 thiswidth
= CHAR_WIDTH (c
);
400 && (width
+ thiswidth
> precision
))
420 /* Return width of string STR of length LEN when displayed in the
421 current buffer. The width is measured by how many columns it
422 occupies on the screen. */
429 return c_string_width (str
, len
, -1, NULL
, NULL
);
432 /* Return width of Lisp string STRING when displayed in the current
433 buffer. The width is measured by how many columns it occupies on
434 the screen while paying attention to compositions. If PRECISION >
435 0, return the width of longest substring that doesn't exceed
436 PRECISION, and set number of characters and bytes of the substring
437 in *NCHARS and *NBYTES respectively. */
440 lisp_string_width (string
, precision
, nchars
, nbytes
)
442 int precision
, *nchars
, *nbytes
;
444 int len
= SCHARS (string
);
445 unsigned char *str
= SDATA (string
);
446 int i
= 0, i_byte
= 0;
448 struct Lisp_Char_Table
*dp
= buffer_display_table ();
452 int chars
, bytes
, thiswidth
;
455 EMACS_INT ignore
, end
;
457 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
458 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
461 thiswidth
= composition_table
[cmp_id
]->width
;
463 bytes
= string_char_to_byte (string
, end
) - i_byte
;
467 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
470 val
= DISP_CHAR_VECTOR (dp
, c
);
472 thiswidth
= XVECTOR (val
)->size
;
474 thiswidth
= CHAR_WIDTH (c
);
478 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
481 thiswidth
= CHAR_WIDTH (c
);
485 && (width
+ thiswidth
> precision
))
505 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
506 doc
: /* Return width of STRING when displayed in the current buffer.
507 Width is measured by how many columns it occupies on the screen.
508 When calculating width of a multibyte character in STRING,
509 only the base leading-code is considered; the validity of
510 the following bytes is not checked. Tabs in STRING are always
511 taken to occupy `tab-width' columns. */)
518 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
522 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
523 doc
: /* Return the direction of CHAR.
524 The returned value is 0 for left-to-right and 1 for right-to-left. */)
530 CHECK_CHARACTER (ch
);
532 return CHAR_TABLE_REF (Vchar_direction_table
, c
);
535 /* Return the number of characters in the NBYTES bytes at PTR.
536 This works by looking at the contents and checking for multibyte
537 sequences while assuming that there's no invalid sequence.
538 However, if the current buffer has enable-multibyte-characters =
539 nil, we treat each byte as a character. */
542 chars_in_text (ptr
, nbytes
)
543 const unsigned char *ptr
;
546 /* current_buffer is null at early stages of Emacs initialization. */
547 if (current_buffer
== 0
548 || NILP (current_buffer
->enable_multibyte_characters
))
551 return multibyte_chars_in_text (ptr
, nbytes
);
554 /* Return the number of characters in the NBYTES bytes at PTR.
555 This works by looking at the contents and checking for multibyte
556 sequences while assuming that there's no invalid sequence. It
557 ignores enable-multibyte-characters. */
560 multibyte_chars_in_text (ptr
, nbytes
)
561 const unsigned char *ptr
;
564 const unsigned char *endp
= ptr
+ nbytes
;
569 int len
= MULTIBYTE_LENGTH (ptr
, endp
);
580 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
581 characters and bytes in it, and store them in *NCHARS and *NBYTES
582 respectively. When counting bytes, note that 8-bit
583 characters that do not form a valid multibyte sequence are
584 represented by 2 bytes in a multibyte text. */
587 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
588 const unsigned char *str
;
589 int len
, *nchars
, *nbytes
;
591 const unsigned char *endp
= str
+ len
;
592 int n
, chars
= 0, bytes
= 0;
594 if (len
>= MAX_MULTIBYTE_LENGTH
)
596 const unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
597 while (str
< adjusted_endp
)
599 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (str
)) > 0)
600 str
+= n
, bytes
+= n
;
608 if ((n
= MULTIBYTE_LENGTH (str
, endp
)) > 0)
609 str
+= n
, bytes
+= n
;
620 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
621 It actually converts only such 8-bit characters that don't construct
622 a multibyte sequence to multibyte forms of Latin-1 characters. If
623 NCHARS is nonzero, set *NCHARS to the number of characters in the
624 text. It is assured that we can use LEN bytes at STR as a work
625 area and that is enough. Return the number of bytes of the
629 str_as_multibyte (str
, len
, nbytes
, nchars
)
631 int len
, nbytes
, *nchars
;
633 unsigned char *p
= str
, *endp
= str
+ nbytes
;
638 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
640 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
641 while (p
< adjusted_endp
642 && (n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
645 while ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
655 safe_bcopy ((char *) p
, (char *) (endp
- nbytes
), nbytes
);
658 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
660 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
661 while (p
< adjusted_endp
)
663 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
671 c
= BYTE8_TO_CHAR (c
);
672 to
+= CHAR_STRING (c
, to
);
679 if ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
687 c
= BYTE8_TO_CHAR (c
);
688 to
+= CHAR_STRING (c
, to
);
697 /* Parse unibyte string at STR of LEN bytes, and return the number of
698 bytes it may occupy when converted to multibyte string by
699 `str_to_multibyte'. */
702 parse_str_to_multibyte (str
, len
)
706 unsigned char *endp
= str
+ len
;
709 for (bytes
= 0; str
< endp
; str
++)
710 bytes
+= (*str
< 0x80) ? 1 : 2;
715 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
716 that contains the same single-byte characters. It actually
717 converts all 8-bit characters to multibyte forms. It is assured
718 that we can use LEN bytes at STR as a work area and that is
722 str_to_multibyte (str
, len
, bytes
)
726 unsigned char *p
= str
, *endp
= str
+ bytes
;
729 while (p
< endp
&& *p
< 0x80) p
++;
735 safe_bcopy ((char *) p
, (char *) (endp
- bytes
), bytes
);
742 c
= BYTE8_TO_CHAR (c
);
743 to
+= CHAR_STRING (c
, to
);
748 /* Arrange multibyte text at STR of BYTES bytes as a unibyte text. It
749 actually converts characters in the range 0x80..0xFF to
753 str_as_unibyte (str
, bytes
)
757 const unsigned char *p
= str
, *endp
= str
+ bytes
;
764 len
= BYTES_BY_CHAR_HEAD (c
);
765 if (CHAR_BYTE8_HEAD_P (c
))
769 to
= str
+ (p
- str
);
773 len
= BYTES_BY_CHAR_HEAD (c
);
774 if (CHAR_BYTE8_HEAD_P (c
))
776 c
= STRING_CHAR_ADVANCE (p
);
777 *to
++ = CHAR_TO_BYTE8 (c
);
781 while (len
--) *to
++ = *p
++;
788 string_count_byte8 (string
)
791 int multibyte
= STRING_MULTIBYTE (string
);
792 int nbytes
= SBYTES (string
);
793 unsigned char *p
= SDATA (string
);
794 unsigned char *pend
= p
+ nbytes
;
802 len
= BYTES_BY_CHAR_HEAD (c
);
804 if (CHAR_BYTE8_HEAD_P (c
))
819 string_escape_byte8 (string
)
822 int nchars
= SCHARS (string
);
823 int nbytes
= SBYTES (string
);
824 int multibyte
= STRING_MULTIBYTE (string
);
826 const unsigned char *src
, *src_end
;
831 if (multibyte
&& nchars
== nbytes
)
834 byte8_count
= string_count_byte8 (string
);
836 if (byte8_count
== 0)
840 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
841 val
= make_uninit_multibyte_string (nchars
+ byte8_count
* 3,
842 nbytes
+ byte8_count
* 2);
844 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
845 val
= make_uninit_string (nbytes
+ byte8_count
* 3);
847 src
= SDATA (string
);
848 src_end
= src
+ nbytes
;
851 while (src
< src_end
)
854 len
= BYTES_BY_CHAR_HEAD (c
);
856 if (CHAR_BYTE8_HEAD_P (c
))
858 c
= STRING_CHAR_ADVANCE (src
);
859 c
= CHAR_TO_BYTE8 (c
);
860 sprintf ((char *) dst
, "\\%03o", c
);
864 while (len
--) *dst
++ = *src
++;
867 while (src
< src_end
)
872 sprintf ((char *) dst
, "\\%03o", c
);
882 DEFUN ("string", Fstring
, Sstring
, 0, MANY
, 0,
884 Concatenate all the argument characters and make the result a string.
885 usage: (string &rest CHARACTERS) */)
891 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
892 unsigned char *p
= buf
;
895 for (i
= 0; i
< n
; i
++)
897 CHECK_CHARACTER (args
[i
]);
899 p
+= CHAR_STRING (c
, p
);
902 return make_string_from_bytes ((char *) buf
, n
, p
- buf
);
906 init_character_once ()
915 DEFSYM (Qcharacterp
, "characterp");
916 DEFSYM (Qauto_fill_chars
, "auto-fill-chars");
918 staticpro (&Vchar_unify_table
);
919 Vchar_unify_table
= Qnil
;
921 defsubr (&Smax_char
);
922 defsubr (&Scharacterp
);
923 defsubr (&Sunibyte_char_to_multibyte
);
924 defsubr (&Smultibyte_char_to_unibyte
);
925 defsubr (&Schar_bytes
);
926 defsubr (&Schar_width
);
927 defsubr (&Sstring_width
);
928 defsubr (&Schar_direction
);
931 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
933 Vector recording all translation tables ever defined.
934 Each element is a pair (SYMBOL . TABLE) relating the table to the
935 symbol naming it. The ID of a translation table is an index into this vector. */);
936 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
938 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
940 A char-table for characters which invoke auto-filling.
941 Such characters have value t in this table. */);
942 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
943 CHAR_TABLE_SET (Vauto_fill_chars
, ' ', Qt
);
944 CHAR_TABLE_SET (Vauto_fill_chars
, '\n', Qt
);
946 DEFVAR_LISP ("char-width-table", &Vchar_width_table
,
948 A char-table for width (columns) of each character. */);
949 Vchar_width_table
= Fmake_char_table (Qnil
, make_number (1));
950 char_table_set_range (Vchar_width_table
, 0x80, 0x9F, make_number (4));
951 char_table_set_range (Vchar_width_table
, MAX_5_BYTE_CHAR
+ 1, MAX_CHAR
,
954 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table
,
955 doc
: /* A char-table for direction of each character. */);
956 Vchar_direction_table
= Fmake_char_table (Qnil
, make_number (1));
958 DEFVAR_LISP ("printable-chars", &Vprintable_chars
,
959 doc
: /* A char-table for each printable character. */);
960 Vprintable_chars
= Fmake_char_table (Qnil
, Qnil
);
961 Fset_char_table_range (Vprintable_chars
,
962 Fcons (make_number (32), make_number (126)), Qt
);
963 Fset_char_table_range (Vprintable_chars
,
964 Fcons (make_number (160),
965 make_number (MAX_5_BYTE_CHAR
)), Qt
);
967 DEFVAR_LISP ("char-script-table", &Vchar_script_table
,
968 doc
: /* Char table of script symbols.
969 It has one extra slot whose value is a list of script symbols. */);
971 /* Intern this now in case it isn't already done.
972 Setting this variable twice is harmless.
973 But don't staticpro it here--that is done in alloc.c. */
974 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
975 DEFSYM (Qchar_script_table
, "char-script-table");
976 Fput (Qchar_script_table
, Qchar_table_extra_slots
, make_number (1));
977 Vchar_script_table
= Fmake_char_table (Qchar_script_table
, Qnil
);
982 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
983 (do not change this comment) */